diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index c34b0c462..49139317c 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -2,6 +2,9 @@ name: continuous build on: [push, pull_request] +permissions: + contents: read # to fetch code (actions/checkout) + jobs: build: runs-on: ${{ matrix.os }} @@ -150,6 +153,7 @@ jobs: matrix: msystem: [MINGW64, MINGW32, CLANG64] idx: [int32, int64] + build-type: [Release] include: - msystem: MINGW64 idx: int32 @@ -173,6 +177,11 @@ jobs: idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 c-lapack-flags: -DC_LAPACK=ON + - msystem: MINGW64 + idx: int32 + target-prefix: mingw-w64-x86_64 + fc-pkg: mingw-w64-x86_64-gcc-fortran + build-type: None exclude: - msystem: MINGW32 idx: int64 @@ -215,11 +224,11 @@ jobs: path: C:/msys64/home/runneradmin/.ccache # We include the commit sha in the cache key, as new cache entries are # only created if there is no existing entry for the key yet. - key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }} + key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }} # Restore a matching ccache cache entry. Prefer same branch. restore-keys: | - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }} - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }} + ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }} + ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }} - name: Configure ccache # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. @@ -235,7 +244,8 @@ jobs: - name: Configure OpenBLAS run: | mkdir build && cd build - cmake -DBUILD_SHARED_LIBS=ON \ + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \ + -DBUILD_SHARED_LIBS=ON \ -DBUILD_STATIC_LIBS=ON \ -DDYNAMIC_ARCH=ON \ -DUSE_THREAD=ON \ @@ -258,6 +268,7 @@ jobs: timeout-minutes: 60 run: cd build && ctest + cross_build: runs-on: ubuntu-22.04 @@ -267,7 +278,7 @@ jobs: include: - target: mips64el triple: mips64el-linux-gnuabi64 - opts: DYNAMIC_ARCH=1 + opts: DYNAMIC_ARCH=1 TARGET=GENERIC - target: riscv64 triple: riscv64-linux-gnu opts: TARGET=RISCV64_GENERIC diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml new file mode 100644 index 000000000..de7c0c0f3 --- /dev/null +++ b/.github/workflows/mips64.yml @@ -0,0 +1,117 @@ +name: mips64 qemu test + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + TEST: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: MIPS64_GENERIC + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=MIPS64_GENERIC + - target: SICORTEX + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=SICORTEX + - target: I6400 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6400 + - target: P6600 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=P6600 + - target: I6500 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6500 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross + + - name: checkout qemu + uses: actions/checkout@v3 
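+        # Descriptive note: qemu is checked out at a pinned commit and, in the
+        # build step below, configured as a user-mode emulator only
+        # (--target-list=mips64el-linux-user --disable-system). User-mode
+        # emulation is all that is needed to run the statically linked
+        # OpenBLAS test binaries built further down in this job.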
+ with: + repository: qemu/qemu + path: qemu + ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 + + - name: build qemu + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system + make -j$(nproc) + make install + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS + run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH + qemu-mips64el ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < 
./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 29ec96f73..37ffe9e83 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -17,6 +17,10 @@ on: # it only makes sense to test if this file has been changed name: Nightly-Homebrew-Build + +permissions: + contents: read # to fetch code (actions/checkout) + jobs: build-OpenBLAS-with-Homebrew: runs-on: macos-latest diff --git a/.travis.yml b/.travis.yml index a4edad726..06db6a95c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,7 +30,7 @@ matrix: before_script: &common-before - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" script: - - travis_wait 40 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - travis_wait 50 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -104,7 +104,7 @@ matrix: - sudo apt-get update - sudo apt-get install gcc-9 gfortran-9 -y script: - - travis_wait 40 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - travis_wait 50 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -121,7 +121,7 @@ matrix: - sudo apt-get update - sudo apt-get install gcc-9 gfortran-9 -y script: - - travis_wait 40 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - travis_wait 50 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c92356e7..e830589e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,10 +212,10 @@ if(NOT NO_LAPACKE) add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) list(APPEND TARGET_OBJS "$") endif() -if(BUILD_RELAPACK) - add_library(RELAPACK OBJECT ${RELA_SOURCES}) - list(APPEND TARGET_OBJS "$") -endif() +#if(BUILD_RELAPACK) +# add_library(RELAPACK OBJECT ${RELA_SOURCES}) +# list(APPEND TARGET_OBJS "$") +#endif() set(OpenBLAS_LIBS "") if(BUILD_STATIC_LIBS) add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 1714d90c8..f5e9dda91 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -211,4 +211,5 @@ In chronological order: * PLCT Lab, Institute of Software Chinese Academy of Sciences * [2022-03] Support RISC-V Vector Intrinisc 1.0 version. 
- \ No newline at end of file +* Pablo Romero + * [2022-08] Fix building from sources for QNX \ No newline at end of file diff --git a/Makefile b/Makefile index 289f0eca5..56af9847e 100644 --- a/Makefile +++ b/Makefile @@ -278,7 +278,11 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc +ifeq ($(F_COMPILER), GFORTRAN) + -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc +else -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.arm64 b/Makefile.arm64 index 4efa55286..fc986f4c0 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -70,12 +70,12 @@ endif ifeq ($(CORE), NEOVERSEN1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ9), 1) -CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=neoverse-n1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif @@ -89,17 +89,17 @@ endif endif # Use a72 tunings because Neoverse-V1 is only available -# in GCC>=9.4 +# in GCC>=10.4 ifeq ($(CORE), NEOVERSEV1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) -CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) +CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif else -CCOMMON_OPT += -march=armv8.4-a -mtune=native +CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=native endif @@ -119,17 +119,21 @@ endif endif # Use a72 tunings because Neoverse-N2 is only available -# in GCC>=9.4 +# in GCC>=10.4 ifeq ($(CORE), NEOVERSEN2) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) +ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif else -CCOMMON_OPT += -march=armv8.5-a -mtune=native +CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a -mtune=native endif diff --git a/Makefile.install b/Makefile.install index 28727de37..87b5bc870 100644 --- a/Makefile.install +++ b/Makefile.install @@ -14,6 +14,11 @@ OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig PKG_EXTRALIB := $(EXTRALIB) +ifeq ($(INTERFACE64),1) + SUFFIX64=64 +endif +PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" + ifeq ($(USE_OPENMP), 
1) ifeq ($(C_COMPILER), PGI) PKG_EXTRALIB += -lomp @@ -150,13 +155,19 @@ endif endif #Generating openblas.pc - @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" +ifeq ($(INTERFACE64),1) + SUFFIX64=64 +endif + PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" + + @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" + @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" + @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" + @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" + @echo 'version='$(VERSION) >> "$(PKGFILE)" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" + @cat openblas.pc.in >> "$(PKGFILE)" #Generating OpenBLASConfig.cmake diff --git a/Makefile.prebuild b/Makefile.prebuild index 5dd7dfa4e..0be4f1274 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -60,9 +60,9 @@ all: getarch_2nd ./getarch_2nd 1 >> $(TARGET_CONF) $(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch - ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" + ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" $(TARGET_FLAGS) $(CFLAGS) ifneq ($(ONLY_CBLAS), 1) - ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" "$(TARGET_FLAGS)" + ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" $(TARGET_FLAGS) else #When we only build CBLAS, we set NOFORTRAN=2 echo "NOFORTRAN=2" >> $(TARGET_MAKE) @@ -77,8 +77,8 @@ endif getarch : getarch.c cpuid.S dummy $(CPUIDEMU) - avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_AVX512); \ - rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_RV64GV); \ + avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \ + rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \ $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy diff --git a/Makefile.rule b/Makefile.rule index 359672359..5e6cefc22 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -131,6 +131,9 @@ BUILD_LAPACK_DEPRECATED = 1 # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 +# Have RecursiveLAPACK actually replace standard LAPACK routines instead of +# just adding 
its equivalents with a RELAPACK_ prefix +# RELAPACK_REPLACE = 1 # If you want to use the legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -207,7 +210,7 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 -# If you need to synchronize FP CSR between threads (for x86/x86_64 only). +# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). # CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute diff --git a/Makefile.system b/Makefile.system index 10b952d4b..3c29ab3f3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,10 @@ ifndef TOPDIR TOPDIR = . endif +ifndef RELAPACK_REPLACE +RELAPACK_REPLACE=0 +endif + # we need to use the host system's architecture for getarch compile options even especially when cross-compiling HOSTARCH := $(shell uname -m) ifeq ($(HOSTARCH), amd64) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f14a8a8ff..d5e9cbfc7 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -143,6 +143,7 @@ ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -mavx2 endif endif +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) ifeq ($(F_COMPILER), GFORTRAN) # AVX2 support was added in 4.7.0 GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) @@ -159,6 +160,7 @@ endif endif endif endif +endif endif diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1102bf0f5..16b9da4f5 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -141,7 +141,7 @@ jobs: - job: OSX_OpenMP pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -151,15 +151,23 @@ jobs: - job: OSX_GCC_Nothreads pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 +- job: OSX_GCC12 + pool: + vmImage: 'macOS-latest' + steps: + - script: | + brew update + make CC=gcc-12 FC=gfortran-12 + - job: OSX_OpenMP_Clang pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -172,7 +180,7 @@ jobs: - job: OSX_OpenMP_Clang_cmake pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -188,7 +196,7 @@ jobs: - job: OSX_dynarch_cmake pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -196,13 +204,13 @@ jobs: - script: | mkdir build cd build - cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. + cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. cmake --build . 
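      # Descriptive note: DYNAMIC_LIST limits the DYNAMIC_ARCH build to the
      # named x86_64 cores on top of the base TARGET (CORE2 here) instead of
      # building kernels for every supported core, which should keep this
      # CI job's build time down.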
ctest - job: OSX_Ifort_Clang pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg @@ -235,7 +243,7 @@ jobs: - job: OSX_NDK_ARMV7 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -255,7 +263,7 @@ jobs: - job: OSX_IOS_ARMV7 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 diff --git a/benchmark/Makefile b/benchmark/Makefile index f2f3b354a..d9ddb9042 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -1,3439 +1,3439 @@ -TOPDIR = .. -include $(TOPDIR)/Makefile.system - -# ACML standard -#ACML=/opt/acml5.3.1/gfortran64_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML custom -#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML 6.1 custom -ACML=/home/saar/acml6.1/gfortran64_mp/lib -LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm - - -# Atlas Ubuntu -#ATLAS=/usr/lib/atlas-base -#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Atlas RHEL and Fedora -ATLAS=/usr/lib64/atlas -LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Intel standard -# MKL=/opt/intel/mkl/lib/intel64 -# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - -# Intel custom -MKL=/home/saar/intel_mkl -LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - -# Apple vecLib -LIBVECLIB = -framework Accelerate - -ESSL=/opt/ibm/lib -#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a -LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a - -ifneq ($(NO_LAPACK), 1) -GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - csymv.goto zsymv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto -else -GOTO_LAPACK_TARGETS= -endif - -ifeq ($(BUILD_BFLOAT16),1) -GOTO_HALF_TARGETS=sbgemm.goto -else -GOTO_HALF_TARGETS= -endif - -ifeq ($(OSNAME), WINNT) - -goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto 
zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto csymv.goto zsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - cher.goto zher.goto \ - cher2.goto zher2.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ - stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) - -acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - cher.acml zher.acml \ - cher2.acml zher2.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ - stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas 
cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - cher.atlas zher.atlas \ - cher2.atlas zher2.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ - stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl \ - srot.mkl drot.mkl csrot.mkl zdrot.mkl \ - srotm.mkl drotm.mkl \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ - sasum.mkl dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - cher2k.mkl zher2k.mkl \ - cher.mkl zher.mkl \ - cher2.mkl zher2.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ - stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl - -else - -goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - cher.goto zher.goto \ - cher2.goto zher2.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - 
sspmv.goto dspmv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ - stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - smallscaling \ - isamax.goto idamax.goto icamax.goto izamax.goto \ - ismax.goto idmax.goto \ - isamin.goto idamin.goto icamin.goto izamin.goto \ - ismin.goto idmin.goto \ - samax.goto damax.goto camax.goto zamax.goto \ - smax.goto dmax.goto \ - samin.goto damin.goto camin.goto zamin.goto \ - smin.goto dmin.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) - -acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - cher.acml zher.acml \ - cher2.acml zher2.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ - stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - 
cher.atlas zher.atlas \ - cher2.atlas zher2.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ - stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ - sasum.mkl dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - cher2k.mkl zher2k.mkl \ - cher.mkl zher.mkl \ - cher2.mkl zher2.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ - stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl - - - - -endif - -essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ - cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ - slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ - scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ - strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl - -veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ - scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ - sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ - strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ - strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ - sspr.veclib dspr.veclib \ - sspr2.veclib dspr2.veclib \ - ssyr.veclib dsyr.veclib \ - ssyr2.veclib dsyr2.veclib \ - ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ - ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ - sger.veclib dger.veclib cger.veclib zger.veclib \ - sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ - srot.veclib drot.veclib csrot.veclib zdrot.veclib \ - srotm.veclib drotm.veclib \ - saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ - scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ - sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ - sscal.veclib dscal.veclib 
cscal.veclib zscal.veclib \ - sasum.veclib dasum.veclib casum.veclib zasum.veclib \ - ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ - chemv.veclib zhemv.veclib \ - chbmv.veclib zhbmv.veclib \ - chpmv.veclib zhpmv.veclib \ - chemm.veclib zhemm.veclib \ - cherk.veclib zherk.veclib \ - cher2k.veclib zher2k.veclib \ - cher.veclib zher.veclib \ - cher2.veclib zher2.veclib \ - sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ - strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ - stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ - stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ - strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ - sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ - sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ - sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ - spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ - ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ - saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib - -goto_3m :: cgemm3m.goto zgemm3m.goto - -mkl_3m :: cgemm3m.mkl zgemm3m.mkl - -all :: goto mkl atlas acml veclib - -exe : - @./Make_exe.sh - -##################################### Slinpack #################################################### -slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -slinpack.acml : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.atlas : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.mkl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.veclib : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.essl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dlinpack #################################################### -dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dlinpack.acml : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.atlas : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.mkl : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.veclib : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.essl : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Clinpack #################################################### - -clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -clinpack.acml : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.atlas : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.mkl : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.veclib : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.essl : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) 
$^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zlinpack #################################################### - -zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zlinpack.acml : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.atlas : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.mkl : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.veclib : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.essl : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Scholesky ################################################### - -scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scholesky.acml : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.atlas : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.mkl : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.veclib : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.essl : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dcholesky ################################################### - -dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dcholesky.acml : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.atlas : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.mkl : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.veclib : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.essl : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ccholesky ################################################### - -ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ccholesky.acml : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.atlas : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.mkl : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.veclib : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.essl : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Zcholesky ################################################### - -zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o 
$(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zcholesky.acml : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.atlas : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.mkl : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.veclib : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.essl : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgemm #################################################### -ifeq ($(BUILD_BFLOAT16),1) -sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -endif - -sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgemm.acml : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.atlas : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.mkl : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.veclib : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.essl : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgemm #################################################### -dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgemm.acml : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.atlas : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.mkl : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.veclib : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.essl : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemm #################################################### - -cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemm.acml : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.atlas : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.mkl : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.veclib : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.essl : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemm #################################################### - -zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemm.acml : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.atlas : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.mkl : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.veclib : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.essl : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssymm #################################################### -ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssymm.acml : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.atlas : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.mkl : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.veclib : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymm #################################################### -dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsymm.acml : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.atlas : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.mkl : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.veclib : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csymm #################################################### - -csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csymm.acml : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.atlas : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.mkl : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.veclib : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsymm #################################################### - -zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsymm.acml : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.atlas : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.mkl : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.veclib : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmm #################################################### -strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmm.acml : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.atlas : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.mkl : strmm.$(SUFFIX) - -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.veclib : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.essl : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmm #################################################### -dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmm.acml : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.atlas : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.mkl : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.veclib : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.essl : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmm #################################################### - -ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmm.acml : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.atlas : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.mkl : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.veclib : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.essl : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmm #################################################### - -ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmm.acml : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.atlas : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.mkl : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.veclib : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.essl : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsm #################################################### -strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsm.acml : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.atlas : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.mkl : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.veclib : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.essl : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsm #################################################### -dtrsm.goto : dtrsm.$(SUFFIX) 
../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsm.acml : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.atlas : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.mkl : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.veclib : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.essl : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrsm #################################################### - -ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsm.acml : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.atlas : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.mkl : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.veclib : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.essl : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsm #################################################### - -ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsm.acml : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.atlas : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.mkl : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.veclib : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.essl : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ssyr #################################################### -ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr.acml : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.atlas : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.mkl : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.veclib : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr #################################################### -dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr.acml : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.atlas : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.mkl : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.veclib : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-##################################### Sspr #################################################### -sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr.acml : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.atlas : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.mkl : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.veclib : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr #################################################### -dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr.acml : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.atlas : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.mkl : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.veclib : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr2 #################################################### -sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr2.acml : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.atlas : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.mkl : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.veclib : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr2 #################################################### -dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr2.acml : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.atlas : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.mkl : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.veclib : dspr2.$(SUFFIX) - -##################################### Ssyr2 #################################################### -ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2.acml : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.atlas : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.mkl : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.veclib : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr2 #################################################### -dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2.acml : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) - -dsyr2.atlas : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.mkl : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.veclib : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssyrk #################################################### -ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyrk.acml : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.atlas : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.mkl : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.veclib : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsyrk #################################################### -dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyrk.acml : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.atlas : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.mkl : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.veclib : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csyrk #################################################### - -csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csyrk.acml : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.atlas : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.mkl : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.veclib : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsyrk #################################################### - -zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsyrk.acml : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.atlas : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.mkl : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.veclib : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssyr2k #################################################### -ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2k.acml : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.atlas : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.mkl : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.veclib : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsyr2k #################################################### -dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2k.acml : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.atlas : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.mkl : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.veclib : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csyr2k #################################################### - -csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csyr2k.acml : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.atlas : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.mkl : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.veclib : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsyr2k #################################################### - -zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsyr2k.acml : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.atlas : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.mkl : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.veclib : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Chemm #################################################### - -chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chemm.acml : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.atlas : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.mkl : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.veclib : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zhemm #################################################### - -zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhemm.acml : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.atlas : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.mkl : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.veclib : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-##################################### Cherk #################################################### - -cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cherk.acml : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.atlas : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.mkl : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.veclib : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zherk #################################################### - -zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zherk.acml : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.atlas : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.mkl : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.veclib : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher2k #################################################### - -cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher2k.acml : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.atlas : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.mkl : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.veclib : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher2k #################################################### - -zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher2k.acml : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.atlas : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.mkl : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.veclib : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher #################################################### - -cher.goto : cher.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher.acml : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.atlas : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.mkl : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.veclib : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher #################################################### - -zher.goto : zher.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm - -zher.acml : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.atlas : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.mkl : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.veclib : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher2 #################################################### - -cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher2.acml : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.atlas : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.mkl : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.veclib : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher2 #################################################### - -zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher2.acml : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.atlas : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.mkl : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.veclib : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgemv.acml : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.atlas : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.mkl : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.veclib : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgemv.acml : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.atlas : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.mkl : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.veclib : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemv #################################################### - -cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemv.acml : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.atlas : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.mkl : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.veclib : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemv.acml : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.atlas : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.mkl : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.veclib : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspmv #################################################### -sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspmv.atlas : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspmv #################################################### -dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspmv.atlas : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmv #################################################### -strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmv.acml : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.atlas : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.mkl : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.veclib : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmv #################################################### -dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmv.acml : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.atlas : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.mkl : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.veclib : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmv #################################################### - -ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmv.acml : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.atlas : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.mkl : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.veclib : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmv #################################################### - -ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmv.acml : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.atlas : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.mkl : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.veclib : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Stpmv #################################################### -stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -stpmv.acml : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.atlas : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.mkl : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.veclib : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtpmv #################################################### -dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtpmv.acml : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.atlas : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.mkl : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.veclib : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctpmv #################################################### - -ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctpmv.acml : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.atlas : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.mkl : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.veclib : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztpmv #################################################### - -ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztpmv.acml : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.atlas : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.mkl : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.veclib : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Stpsv #################################################### -stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) 
$(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -stpsv.acml : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.atlas : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.mkl : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.veclib : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtpsv #################################################### -dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtpsv.acml : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.atlas : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.mkl : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.veclib : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctpsv #################################################### - -ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctpsv.acml : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.atlas : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.mkl : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.veclib : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztpsv #################################################### - -ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztpsv.acml : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.atlas : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.mkl : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.veclib : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsv #################################################### -strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsv.acml : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.atlas : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.mkl : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.veclib : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsv #################################################### -dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsv.acml : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.atlas : dtrsv.$(SUFFIX) - 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.mkl : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.veclib : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrsv #################################################### - -ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsv.acml : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.atlas : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.mkl : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.veclib : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsv #################################################### - -ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsv.acml : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.atlas : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.mkl : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.veclib : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sger #################################################### -sger.goto : sger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sger.acml : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.atlas : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.mkl : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.veclib : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dger #################################################### -dger.goto : dger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dger.acml : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.atlas : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.mkl : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.veclib : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cger #################################################### -cger.goto : cger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cger.acml : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.atlas : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.mkl : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.veclib : cger.$(SUFFIX) - -$(CC) $(CFLAGS) 
-o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zger #################################################### -zger.goto : zger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zger.acml : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.atlas : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.mkl : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.veclib : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssymv #################################################### -ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssymv.acml : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.atlas : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.mkl : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.veclib : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymv #################################################### -dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsymv.acml : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.atlas : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.mkl : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.veclib : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csymv #################################################### -csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csymv.acml : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.atlas : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.mkl : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.veclib : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymv #################################################### -zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsymv.acml : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.atlas : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.mkl : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.veclib : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgeev #################################################### -sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) 
-o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgeev.acml : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.atlas : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.mkl : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.veclib : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgeev #################################################### -dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgeev.acml : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.atlas : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.mkl : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.veclib : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgeev #################################################### - -cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgeev.acml : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.atlas : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.mkl : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.veclib : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgeev #################################################### - -zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgeev.acml : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.atlas : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.mkl : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.veclib : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgetri #################################################### -sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgetri.acml : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.atlas : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.mkl : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.veclib : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgetri #################################################### -dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgetri.acml : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.atlas : 
dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.mkl : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.veclib : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgetri #################################################### - -cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgetri.acml : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.atlas : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.mkl : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.veclib : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgetri #################################################### - -zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgetri.acml : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.atlas : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.mkl : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.veclib : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Spotrf #################################################### -spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -spotrf.acml : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.atlas : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.mkl : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.veclib : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dpotrf #################################################### -dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dpotrf.acml : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.atlas : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.mkl : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.veclib : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cpotrf #################################################### - -cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cpotrf.acml : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.atlas : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.mkl : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.veclib : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zpotrf #################################################### - -zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zpotrf.acml : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.atlas : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.mkl : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.veclib : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Chemv #################################################### - -chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chemv.acml : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.atlas : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.mkl : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.veclib : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zhemv #################################################### - -zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhemv.acml : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.atlas : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.mkl : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.veclib : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chbmv #################################################### - -chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chbmv.acml : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.atlas : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.mkl : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.veclib : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhbmv #################################################### - -zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhbmv.acml : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.atlas : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.mkl : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.veclib : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
-##################################### Chpmv #################################################### - -chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chpmv.acml : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.atlas : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.mkl : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.veclib : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhpmv #################################################### - -zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhpmv.acml : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.atlas : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.mkl : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.veclib : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Sdot #################################################### -sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sdot.acml : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.atlas : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.mkl : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.veclib : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ddot #################################################### -ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ddot.acml : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.atlas : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.mkl : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.veclib : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cdot #################################################### -cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cdot.acml : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.atlas : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.mkl : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.veclib : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zdot #################################################### -zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdot.acml : zdot.$(SUFFIX) - 
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.atlas : zdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.mkl : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.veclib : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Srot #################################################### -srot.goto : srot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srot.acml : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.atlas : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.mkl : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.veclib : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Drot #################################################### -drot.goto : drot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drot.acml : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.atlas : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.mkl : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.veclib : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### csrot #################################################### -csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csrot.acml : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.atlas : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.mkl : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.veclib : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### zdrot #################################################### -zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdrot.acml : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.atlas : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.mkl : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.veclib : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### srotm #################################################### -srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srotm.acml : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.atlas : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.mkl : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.veclib : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### drotm #################################################### -drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drotm.acml : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.atlas : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.mkl : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.veclib : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpy.acml : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.atlas : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.mkl : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.veclib : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpy.acml : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.atlas : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.mkl : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.veclib : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpy.acml : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.atlas : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.mkl : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.veclib : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpy.acml : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.atlas : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.mkl : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.veclib : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpby 
#################################################### -saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpby.acml : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.atlas : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.mkl : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.veclib : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpby #################################################### -daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpby.acml : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.atlas : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.mkl : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.veclib : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpby #################################################### - -caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpby.acml : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.atlas : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.mkl : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.veclib : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpby #################################################### - -zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpby.acml : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.atlas : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.mkl : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.veclib : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Scopy #################################################### -scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scopy.acml : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.atlas : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.mkl : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.veclib : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dcopy #################################################### -dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
-lm - -dcopy.acml : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.atlas : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.mkl : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.veclib : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ccopy #################################################### - -ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ccopy.acml : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.atlas : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.mkl : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.veclib : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zcopy #################################################### - -zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zcopy.acml : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.atlas : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.mkl : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.veclib : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sscal #################################################### -sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sscal.acml : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.atlas : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.mkl : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.veclib : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dscal #################################################### -dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dscal.acml : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.atlas : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.mkl : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.veclib : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cscal #################################################### - -cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cscal.acml : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.atlas : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -cscal.mkl : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.veclib : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zscal #################################################### - -zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zscal.acml : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.atlas : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.mkl : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.veclib : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sasum #################################################### -sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sasum.acml : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.atlas : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.mkl : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.veclib : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dasum #################################################### -dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dasum.acml : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.atlas : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.mkl : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.veclib : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Casum #################################################### - -casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -casum.acml : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.atlas : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.mkl : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.veclib : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zasum #################################################### - -zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zasum.acml : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.atlas : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.mkl : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.veclib : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sswap #################################################### -sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sswap.acml : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.atlas : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.mkl : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.veclib : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dswap #################################################### -dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dswap.acml : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.atlas : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.mkl : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.veclib : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cswap #################################################### - -cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cswap.acml : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.atlas : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.mkl : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.veclib : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zswap #################################################### - -zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zswap.acml : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.atlas : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.mkl : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.veclib : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Sgesv #################################################### -sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgesv.acml : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.atlas : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.mkl : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.veclib : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgesv #################################################### -dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) 
$(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgesv.acml : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.atlas : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.mkl : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.veclib : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgesv #################################################### - -cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgesv.acml : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.atlas : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.mkl : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.veclib : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgesv #################################################### - -zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgesv.acml : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.atlas : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.mkl : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.veclib : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Cgemm3m #################################################### - -cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemm3m.mkl : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm3m.veclib : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemm3m #################################################### - -zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemm3m.mkl : zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm3m.veclib : zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ISAMAX ############################################## -isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -isamax.atlas : isamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IDAMAX ############################################## -idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -idamax.atlas : idamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ICAMAX ############################################## 
-icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -icamax.atlas : icamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IZAMAX ############################################## -izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -izamax.atlas : izamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ISMAX ############################################## -ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMAX ############################################## -idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMIN ############################################## -isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMIN ############################################## -idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMIN ############################################## -icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMIN ############################################## -izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMIN ############################################## -ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMIN ############################################## -idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMAX ############################################## -samax.goto : samax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMAX ############################################## -damax.goto : damax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMAX ############################################## -camax.goto : camax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMAX ############################################## -zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMAX ############################################## -smax.goto : smax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMAX ############################################## -dmax.goto : 
dmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMIN ############################################## -samin.goto : samin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMIN ############################################## -damin.goto : damin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMIN ############################################## -smin.goto : smin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMIN ############################################## -dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SNRM2 ############################################## -snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -snrm2.atlas : snrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## DNRM2 ############################################## -dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dnrm2.atlas : dnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## Sscnrm2 ############################################## -scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scnrm2.atlas : scnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## Ddznrm2 ############################################## -dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dznrm2.atlas : dznrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -################################################################################################### - -slinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -clinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ 
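Editor's note on the compile rules that start here (and recur through the rest of this file, both in the removed and the added hunks): every benchmark source such as linpack.c, cholesky.c or gemm.c is generic, and the four precision variants are produced by compiling it once per type with -UCOMPLEX/-DCOMPLEX selecting real vs. complex and -UDOUBLE/-DDOUBLE selecting single vs. double. A minimal sketch of that pattern, assuming a placeholder source name bench.c that does not exist in the tree:

# Sketch only -- "bench" stands for any of the generic benchmark sources.
sbench.$(SUFFIX) : bench.c    # single-precision real
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dbench.$(SUFFIX) : bench.c    # double-precision real
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
cbench.$(SUFFIX) : bench.c    # single-precision complex
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zbench.$(SUFFIX) : bench.c    # double-precision complex
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^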
- -ifeq ($(BUILD_BFLOAT16),1) -sbgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ -endif - -sgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher.$(SUFFIX) : her.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher.$(SUFFIX) : her.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher2.$(SUFFIX) : her2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher2.$(SUFFIX) : her2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o 
$(@F) $^ - -dgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -strmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -stpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -stpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -ssymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -spotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - 
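Editor's note on the link rules that accompany each of these objects, both above and in the replacement rules added further down: the .goto binary links the freshly built ../$(LIBNAME) and needs -lm, while the .acml/.atlas/.mkl/.veclib/.essl binaries link the corresponding LIB* variable and prefix the recipe with "-" so that make ignores a failed link when that backend is not installed. $(@F) expands to the file part of the target name and $^ to all prerequisites. A generic sketch, where foo and LIBVENDOR are placeholders rather than names from this Makefile:

# Sketch only: the two flavours of link rule used throughout this file.
foo.goto : foo.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

foo.vendor : foo.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVENDOR) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

The grouped goto/acml/atlas/mkl/veclib targets defined later in the file then simply depend on long lists of these per-routine binaries.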
-zhbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -sscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -casum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -sgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - - -cgemm3m.$(SUFFIX) : gemm3m.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ 
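Editor's note: the complex dot products are the one deviation from the single-source pattern. cdot/zdot are built from zdot.c, and separate cdot-intel/zdot-intel objects are built from zdot-intel.c, apparently because a complex-valued dot product can be returned either by value or through a hidden result argument depending on the compiler and library convention. A hedged sketch of how a build might select one variant; RETURN_BY_STACK is only an illustrative name, not a variable defined in this Makefile:

# Sketch only: pick the dot-product object matching the calling convention in use.
ifeq ($(RETURN_BY_STACK), 1)
ZDOT_OBJS = cdot-intel.$(SUFFIX) zdot-intel.$(SUFFIX)
else
ZDOT_OBJS = cdot.$(SUFFIX) zdot.$(SUFFIX)
endif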
- -zgemm3m.$(SUFFIX) : gemm3m.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -isamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -ismax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - -isamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -ismin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - -samax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -camax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zamax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -smax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - -samin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -camin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zamin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -smin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - -snrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dnrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -scnrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -dznrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -smallscaling: smallscaling.c ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread - -clean :: - @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling - -include $(TOPDIR)/Makefile.tail +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system
+
+# ACML standard
+#ACML=/opt/acml5.3.1/gfortran64_mp/lib
+#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
+
+# ACML custom
+#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
+#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
+
+# ACML 6.1 custom
+ACML=/home/saar/acml6.1/gfortran64_mp/lib
+LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm
+
+
+# Atlas Ubuntu
+#ATLAS=/usr/lib/atlas-base
+#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm
+
+# Atlas RHEL and Fedora
+ATLAS=/usr/lib64/atlas
+LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm
+
+# Intel standard
+# MKL=/opt/intel/mkl/lib/intel64
+# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm
+
+# Intel custom
+MKL=/home/saar/intel_mkl
+LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm
+
+# Apple vecLib
+LIBVECLIB = -framework Accelerate
+
+ESSL=/opt/ibm/lib
+#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
+LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
+
+ifneq ($(NO_LAPACK), 1)
+GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
+	scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
+	sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
+	sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
+	csymv.goto zsymv.goto \
+	sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
+	spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto
+else
+GOTO_LAPACK_TARGETS=
+endif
+
+ifeq ($(BUILD_BFLOAT16),1)
+GOTO_HALF_TARGETS=sbgemm.goto
+else
+GOTO_HALF_TARGETS=
+endif
+
+ifeq ($(OSNAME), WINNT)
+
+goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
+	scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
+	sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
+	strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \
+	strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \
+	sspr.goto dspr.goto \
+	sspr2.goto dspr2.goto \
+	ssyr.goto dsyr.goto \
+	ssyr2.goto dsyr2.goto \
+	ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
+	ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
+	sger.goto dger.goto cger.goto zger.goto \
+	sdot.goto ddot.goto \
+	srot.goto drot.goto csrot.goto zdrot.goto \
+	srotm.goto drotm.goto \
+	saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
+	scopy.goto dcopy.goto ccopy.goto zcopy.goto \
+	sswap.goto dswap.goto cswap.goto zswap.goto \
+	sscal.goto dscal.goto cscal.goto zscal.goto \
+	sasum.goto dasum.goto casum.goto zasum.goto \
+	ssymv.goto dsymv.goto csymv.goto zsymv.goto \
+	chemv.goto zhemv.goto \
+	chbmv.goto zhbmv.goto \
+	chpmv.goto zhpmv.goto \
+	chemm.goto zhemm.goto \
+	cherk.goto zherk.goto \
+	cher2k.goto zher2k.goto \
+	cher.goto zher.goto \
+	cher2.goto zher2.goto \
+	sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
+	sspmv.goto dspmv.goto \
+	strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \
+	stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \
+	stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \
+	strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \
+	sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
+	sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
+	sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
+	spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
+	ssymm.goto dsymm.goto csymm.goto zsymm.goto
\ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.acml zhemm.acml \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sspmv.atlas dspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas 
zaxpby.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl \ + srot.mkl drot.mkl csrot.mkl zdrot.mkl \ + srotm.mkl drotm.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + +else + +goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto cdot.goto zdot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto \ + chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sspmv.goto dspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + smallscaling \ + isamax.goto idamax.goto icamax.goto izamax.goto \ + ismax.goto idmax.goto \ + isamin.goto idamin.goto icamin.goto izamin.goto \ + ismin.goto idmin.goto \ + samax.goto damax.goto camax.goto zamax.goto \ + smax.goto dmax.goto \ + samin.goto damin.goto camin.goto zamin.goto \ + smin.goto dmin.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml 
ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.acml zhemm.acml \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sspmv.atlas dspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ + snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl 
\ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + + + + +endif + +essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ + cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ + scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ + strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl + +veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ + scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ + sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ + strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ + strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + sspr.veclib dspr.veclib \ + sspr2.veclib dspr2.veclib \ + ssyr.veclib dsyr.veclib \ + ssyr2.veclib dsyr2.veclib \ + ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ + ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ + sger.veclib dger.veclib cger.veclib zger.veclib \ + sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + srot.veclib drot.veclib csrot.veclib zdrot.veclib \ + srotm.veclib drotm.veclib \ + saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ + scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ + sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ + sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ + sasum.veclib dasum.veclib casum.veclib zasum.veclib \ + ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ + chemv.veclib zhemv.veclib \ + chbmv.veclib zhbmv.veclib \ + chpmv.veclib zhpmv.veclib \ + chemm.veclib zhemm.veclib \ + cherk.veclib zherk.veclib \ + cher2k.veclib zher2k.veclib \ + cher.veclib zher.veclib \ + cher2.veclib zher2.veclib \ + sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ + stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ + strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ + sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ + sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ + sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ + spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ + saxpby.veclib daxpby.veclib caxpby.veclib 
zaxpby.veclib + +goto_3m :: cgemm3m.goto zgemm3m.goto + +mkl_3m :: cgemm3m.mkl zgemm3m.mkl + +all :: goto mkl atlas acml veclib + +exe : + @./Make_exe.sh + +##################################### Slinpack #################################################### +slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +slinpack.acml : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.atlas : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.mkl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.veclib : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.essl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dlinpack #################################################### +dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dlinpack.acml : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.atlas : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.mkl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.veclib : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.essl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Clinpack #################################################### + +clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +clinpack.acml : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.atlas : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.mkl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.veclib : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.essl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zlinpack #################################################### + +zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zlinpack.acml : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.atlas : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.mkl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.veclib : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.essl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scholesky ################################################### + +scholesky.goto 
: scholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scholesky.acml : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.atlas : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.mkl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.veclib : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.essl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcholesky ################################################### + +dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcholesky.acml : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.atlas : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.mkl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.veclib : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.essl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccholesky ################################################### + +ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccholesky.acml : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.atlas : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.mkl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.veclib : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.essl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Zcholesky ################################################### + +zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcholesky.acml : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.atlas : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.mkl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.veclib : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.essl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemm #################################################### +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + +sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm + +sgemm.acml : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.atlas : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.mkl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.veclib : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.essl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemm #################################################### +dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemm.acml : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.atlas : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.mkl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.veclib : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.essl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemm #################################################### + +cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm.acml : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.atlas : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.mkl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.veclib : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.essl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm #################################################### + +zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm.acml : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.atlas : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.mkl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.veclib : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.essl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymm #################################################### +ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymm.acml : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.atlas : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.mkl : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.veclib : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymm #################################################### +dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsymm.acml : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.atlas : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.mkl : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.veclib : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csymm #################################################### + +csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csymm.acml : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.atlas : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.mkl : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.veclib : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsymm #################################################### + +zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsymm.acml : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.atlas : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.mkl : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.veclib : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmm #################################################### +strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmm.acml : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.atlas : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.mkl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.veclib : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.essl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmm #################################################### +dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmm.acml : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.atlas : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.mkl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.veclib : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.essl : dtrmm.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmm #################################################### + +ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmm.acml : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.atlas : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.mkl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.veclib : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.essl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmm #################################################### + +ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmm.acml : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.atlas : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.mkl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.veclib : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.essl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsm #################################################### +strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsm.acml : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.atlas : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.mkl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.veclib : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.essl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsm #################################################### +dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsm.acml : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.atlas : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.mkl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.veclib : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.essl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsm #################################################### + +ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsm.acml : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.atlas : 
ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.mkl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.veclib : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.essl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsm #################################################### + +ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsm.acml : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.atlas : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.mkl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.veclib : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.essl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr #################################################### +ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr.acml : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.atlas : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.mkl : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.veclib : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr #################################################### +dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr.acml : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.atlas : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.mkl : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.veclib : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr #################################################### +sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr.acml : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.atlas : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.mkl : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.veclib : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr #################################################### +dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr.acml : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.atlas : dspr.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.mkl : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.veclib : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr2 #################################################### +sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr2.acml : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.atlas : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.mkl : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.veclib : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr2 #################################################### +dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr2.acml : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.atlas : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.mkl : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.veclib : dspr2.$(SUFFIX) + +##################################### Ssyr2 #################################################### +ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2.acml : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.atlas : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.mkl : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.veclib : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr2 #################################################### +dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2.acml : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.atlas : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.mkl : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.veclib : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyrk #################################################### +ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyrk.acml : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.atlas : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.mkl : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.veclib : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
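Each routine above follows the same shape: the .goto target links the benchmark driver against the freshly built ../$(LIBNAME), while the .acml, .atlas, .mkl, .veclib and .essl targets link the same object against the corresponding external BLAS, and the leading - on those recipes lets make continue when a given vendor library is not available. A minimal usage sketch follows; it assumes the library has already been built at the top of the tree, that vendor paths such as $(LIBMKL) are configured before requesting a non-goto target, and that the benchmark drivers take their problem sizes as "from to step" command-line arguments. These invocation details are assumptions for illustration, not part of this patch.

    # build the OpenBLAS-linked SGEMM benchmark from the benchmark directory
    make -C benchmark sgemm.goto
    # run problem sizes N = 100 .. 1000 in steps of 100 (assumed argument convention)
    ./benchmark/sgemm.goto 100 1000 100
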
+##################################### Dsyrk #################################################### +dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyrk.acml : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.atlas : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.mkl : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.veclib : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyrk #################################################### + +csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyrk.acml : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.atlas : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.mkl : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.veclib : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyrk #################################################### + +zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyrk.acml : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.atlas : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.mkl : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.veclib : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyr2k #################################################### +ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2k.acml : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.atlas : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.mkl : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.veclib : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsyr2k #################################################### +dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2k.acml : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.atlas : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.mkl : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.veclib : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyr2k #################################################### + +csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyr2k.acml : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.atlas : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.mkl : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.veclib : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyr2k #################################################### + +zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyr2k.acml : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.atlas : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.mkl : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.veclib : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemm #################################################### + +chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chemm.acml : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.atlas : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.mkl : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.veclib : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemm #################################################### + +zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemm.acml : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.atlas : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.mkl : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.veclib : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cherk #################################################### + +cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cherk.acml : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.atlas : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.mkl : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.veclib : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zherk #################################################### + +zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zherk.acml : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.atlas : 
zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.mkl : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.veclib : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2k #################################################### + +cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2k.acml : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.atlas : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.mkl : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.veclib : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2k #################################################### + +zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2k.acml : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.atlas : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.mkl : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.veclib : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher #################################################### + +cher.goto : cher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher.acml : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.atlas : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.mkl : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.veclib : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher #################################################### + +zher.goto : zher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher.acml : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.atlas : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.mkl : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.veclib : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2 #################################################### + +cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2.acml : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.atlas : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.mkl : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +cher2.veclib : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2 #################################################### + +zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2.acml : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.atlas : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.mkl : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.veclib : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgemv.acml : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.atlas : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.mkl : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.veclib : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemv.acml : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.atlas : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.mkl : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.veclib : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemv #################################################### + +cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemv.acml : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.atlas : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.mkl : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.veclib : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemv.acml : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.atlas : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.mkl : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.veclib : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspmv 
#################################################### +sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspmv.atlas : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspmv #################################################### +dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspmv.atlas : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Stpmv #################################################### +stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpmv.acml : stpmv.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.atlas : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.mkl : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.veclib : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpmv #################################################### +dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpmv.acml : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.atlas : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.mkl : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.veclib : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpmv #################################################### + +ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpmv.acml : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.atlas : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.mkl : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.veclib : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpmv #################################################### + +ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpmv.acml : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.atlas : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.mkl : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.veclib : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Stpsv #################################################### +stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpsv.acml : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.atlas : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.mkl : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.veclib : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpsv #################################################### +dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpsv.acml : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.atlas : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.mkl : 
dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.veclib : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpsv #################################################### + +ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpsv.acml : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.atlas : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.mkl : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.veclib : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpsv #################################################### + +ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpsv.acml : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.atlas : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.mkl : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.veclib : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsv #################################################### +strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsv.acml : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.atlas : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.mkl : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.veclib : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsv #################################################### +dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsv.acml : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.atlas : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.mkl : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.veclib : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsv #################################################### + +ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsv.acml : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.atlas : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.mkl : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.veclib : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +##################################### Ztrsv #################################################### + +ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsv.acml : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.atlas : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.mkl : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.veclib : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sger #################################################### +sger.goto : sger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sger.acml : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.atlas : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.mkl : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.veclib : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dger #################################################### +dger.goto : dger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dger.acml : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.atlas : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.mkl : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.veclib : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cger #################################################### +cger.goto : cger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cger.acml : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.atlas : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.mkl : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.veclib : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zger #################################################### +zger.goto : zger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zger.acml : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.atlas : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.mkl : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.veclib : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymv #################################################### +ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymv.acml : 
ssymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ssymv.atlas : ssymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ssymv.mkl : ssymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ssymv.veclib : ssymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Dsymv ####################################################
+dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+dsymv.acml : dsymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dsymv.atlas : dsymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dsymv.mkl : dsymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dsymv.veclib : dsymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Csymv ####################################################
+csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+csymv.acml : csymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csymv.atlas : csymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csymv.mkl : csymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csymv.veclib : csymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zsymv ####################################################
+zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+zsymv.acml : zsymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsymv.atlas : zsymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsymv.mkl : zsymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsymv.veclib : zsymv.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Sgeev ####################################################
+sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
+	$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+sgeev.acml : sgeev.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+sgeev.atlas : sgeev.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+sgeev.mkl : sgeev.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+sgeev.veclib : sgeev.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Dgeev ####################################################
+dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME)
+	$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+dgeev.acml : dgeev.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dgeev.atlas : dgeev.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+dgeev.mkl : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.veclib : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgeev #################################################### + +cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgeev.acml : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.atlas : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.mkl : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.veclib : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgeev #################################################### + +zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgeev.acml : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.atlas : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.mkl : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.veclib : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgetri #################################################### +sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgetri.acml : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.atlas : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.mkl : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.veclib : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgetri #################################################### +dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgetri.acml : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.atlas : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.mkl : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.veclib : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgetri #################################################### + +cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgetri.acml : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.atlas : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.mkl : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.veclib : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgetri #################################################### + +zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgetri.acml : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.atlas : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.mkl : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.veclib : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Spotrf #################################################### +spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +spotrf.acml : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.atlas : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.mkl : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.veclib : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dpotrf #################################################### +dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dpotrf.acml : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.atlas : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.mkl : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.veclib : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cpotrf #################################################### + +cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cpotrf.acml : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.atlas : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.mkl : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.veclib : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zpotrf #################################################### + +zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zpotrf.acml : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.atlas : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.mkl : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.veclib : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemv #################################################### + 
+chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chemv.acml : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.atlas : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.mkl : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.veclib : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemv #################################################### + +zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemv.acml : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.atlas : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.mkl : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.veclib : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chbmv #################################################### + +chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chbmv.acml : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.atlas : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.mkl : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.veclib : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhbmv #################################################### + +zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhbmv.acml : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.atlas : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.mkl : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.veclib : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chpmv #################################################### + +chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chpmv.acml : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.atlas : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.mkl : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.veclib : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhpmv #################################################### + +zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhpmv.acml : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +zhpmv.atlas : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.mkl : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.veclib : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sdot.acml : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.atlas : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.mkl : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.veclib : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ddot.acml : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.atlas : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.mkl : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.veclib : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cdot.acml : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.atlas : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.mkl : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.veclib : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdot.acml : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.atlas : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.mkl : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.veclib : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srot.acml : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.atlas : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.mkl : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.veclib 
: srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drot.acml : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.atlas : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.mkl : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.veclib : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### csrot #################################################### +csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csrot.acml : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.atlas : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.mkl : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.veclib : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### zdrot #################################################### +zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdrot.acml : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.atlas : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.mkl : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.veclib : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### srotm #################################################### +srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srotm.acml : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.atlas : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.mkl : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.veclib : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### drotm #################################################### +drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drotm.acml : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.atlas : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.mkl : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.veclib : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + 
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpy.acml : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.atlas : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.mkl : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.veclib : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpy.acml : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.atlas : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.mkl : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.veclib : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpy.acml : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.atlas : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.mkl : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.veclib : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpy.acml : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.atlas : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.mkl : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.veclib : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpby #################################################### +saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpby.acml : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.atlas : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.mkl : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.veclib : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpby #################################################### +daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpby.acml : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
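(Aside on how these axpy/axpby binaries exercise the libraries: the drivers in this directory call the Fortran-style symbols directly, with every argument passed by pointer, so each *.goto/*.atlas/*.mkl/*.veclib executable times the same entry point the linked library exports. A minimal standalone caller in that style is sketched below; the trailing-underscore symbol name, the 32-bit integer width, and the link line are assumptions that depend on how the library was configured, not something fixed by this patch.)

    /* sketch: calling SAXPY the way these benchmark drivers do, i.e. through the
       Fortran-style interface with all arguments passed by pointer.
       Assumes default underscore mangling and 32-bit integers; build with
       something like `cc saxpy_demo.c -lopenblas -o saxpy_demo`. */
    #include <stdio.h>

    extern void saxpy_(const int *n, const float *alpha, const float *x,
                       const int *incx, float *y, const int *incy);

    int main(void) {
        float alpha = 2.0f;
        float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float y[4] = {1.0f, 1.0f, 1.0f, 1.0f};
        int n = 4, inc = 1;

        saxpy_(&n, &alpha, x, &inc, y, &inc);   /* y := alpha*x + y, roughly 2*n flops */

        for (int i = 0; i < n; i++)
            printf("%g ", y[i]);                /* expect: 3 5 7 9 */
        printf("\n");
        return 0;
    }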
+daxpby.atlas : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.mkl : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.veclib : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpby #################################################### + +caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpby.acml : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.atlas : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.mkl : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.veclib : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpby #################################################### + +zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpby.acml : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.atlas : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.mkl : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.veclib : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scopy.acml : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.atlas : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.mkl : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.veclib : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcopy.acml : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.atlas : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.mkl : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.veclib : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccopy.acml : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.atlas : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.mkl : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.veclib : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcopy.acml : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.atlas : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.mkl : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.veclib : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sscal.acml : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.atlas : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.mkl : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.veclib : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dscal.acml : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.atlas : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.mkl : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.veclib : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cscal.acml : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.atlas : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.mkl : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.veclib : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zscal.acml : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.atlas : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.mkl : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.veclib : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### 
Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sasum.acml : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.atlas : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.mkl : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.veclib : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dasum.acml : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.atlas : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.mkl : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.veclib : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +casum.acml : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.atlas : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.mkl : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.veclib : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zasum.acml : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.atlas : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.mkl : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.veclib : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sswap.acml : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.atlas : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.mkl : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.veclib : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dswap.acml : dswap.$(SUFFIX) + 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.atlas : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.mkl : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.veclib : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cswap.acml : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.atlas : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.mkl : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.veclib : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zswap.acml : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.atlas : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.mkl : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.veclib : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Sgesv #################################################### +sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgesv.acml : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.atlas : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.mkl : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.veclib : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgesv #################################################### +dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgesv.acml : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.atlas : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.mkl : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.veclib : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgesv #################################################### + +cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgesv.acml : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.atlas : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.mkl : 
cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.veclib : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgesv #################################################### + +zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgesv.acml : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.atlas : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.mkl : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.veclib : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Cgemm3m #################################################### + +cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm3m.mkl : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm3m.veclib : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm3m #################################################### + +zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm3m.mkl : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm3m.veclib : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +isamax.atlas : isamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +idamax.atlas : idamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +icamax.atlas : icamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +izamax.atlas : izamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISMAX ############################################## +ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMAX ############################################## +camax.goto : camax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMAX ############################################## +zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX ############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX ############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + 
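(Note on the reduction targets above: apart from the standard i?amax routines, the ?amax/?amin, ?max/?min and i?min/i?max kernels are OpenBLAS-specific extensions, which is presumably why most of them only get a .goto rule here and no .acml/.mkl/.veclib links. To make the semantics concrete, here is a self-contained reference sketch of what two of them compute; it is a plain C model, not a call into the library, since the real prototypes and name mangling depend on the build.)

    /* sketch: reference semantics of two of the reduction kernels timed above.
       isamax (standard BLAS) returns the 1-based index of the element with the
       largest absolute value; samax (an extension) returns that value itself. */
    #include <math.h>
    #include <stdio.h>

    static int isamax_ref(int n, const float *x, int incx) {
        int best = 1;                      /* BLAS indices are 1-based */
        float bestval = fabsf(x[0]);
        for (int i = 1; i < n; i++) {
            float v = fabsf(x[i * incx]);
            if (v > bestval) { bestval = v; best = i + 1; }
        }
        return best;
    }

    static float samax_ref(int n, const float *x, int incx) {
        float m = fabsf(x[0]);
        for (int i = 1; i < n; i++) {
            float v = fabsf(x[i * incx]);
            if (v > m) m = v;
        }
        return m;
    }

    int main(void) {
        float x[5] = {0.5f, -3.0f, 2.0f, -0.25f, 1.0f};
        printf("isamax = %d, samax = %g\n",
               isamax_ref(5, x, 1), samax_ref(5, x, 1));  /* expect: 2 and 3 */
        return 0;
    }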
+############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SNRM2 ############################################## +snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +snrm2.atlas : snrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## DNRM2 ############################################## +dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dnrm2.atlas : dnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Sscnrm2 ############################################## +scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scnrm2.atlas : scnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Ddznrm2 ############################################## +dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dznrm2.atlas : dznrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +################################################################################################### + +slinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +clinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + +sgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c 
-DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +strmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +stpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + 
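(On the compile rules above: each generic source, gemm.c, trmm.c, tpmv.c and so on, is built four times, with -UCOMPLEX/-DCOMPLEX and -UDOUBLE/-DDOUBLE selecting the precision, exactly as the amax.c source further down in this patch does with its nested #ifdef blocks. The stripped-down model below mirrors that mechanism; FLOAT and COMPSIZE are written out by hand here purely for illustration, whereas the benchmarks get them from the common OpenBLAS headers pulled in through bench.h.)

    /* sketch: how one generic .c file yields s/d/c/z objects purely from the
       -DCOMPLEX/-DDOUBLE flags used in the rules above. FLOAT, COMPSIZE and the
       routine-name selection are illustrative stand-ins, not the real headers. */
    #include <stdio.h>

    #ifdef DOUBLE
    #define FLOAT double
    #else
    #define FLOAT float
    #endif

    #ifdef COMPLEX
    #define COMPSIZE 2          /* interleaved real/imaginary parts */
    #else
    #define COMPSIZE 1
    #endif

    #ifdef COMPLEX
    #ifdef DOUBLE
    #define NAME "zroutine"
    #else
    #define NAME "croutine"
    #endif
    #else
    #ifdef DOUBLE
    #define NAME "droutine"
    #else
    #define NAME "sroutine"
    #endif
    #endif

    int main(void) {
        /* e.g. cc -DCOMPLEX -UDOUBLE sketch.c -o sketch  ->  croutine, 4-byte values, 2 per element */
        printf("%s: %zu-byte values, %d value(s) per element\n",
               NAME, sizeof(FLOAT), COMPSIZE);
        return 0;
    }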
+ztpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +stpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ssymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +spotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + 
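(On the cdot/zdot rules above: the benchmark keeps two variants of the same driver, zdot.c for the .goto/.acml/.atlas targets and zdot-intel.c for .mkl/.veclib. The usual reason for such a split is that Fortran-style complex dot products have two incompatible return conventions, returning the complex result by value versus writing it through a result pointer, and which one a library expects depends on the compiler convention it was built against. That reading of the two source files is an assumption here, not something visible in this hunk. A toy illustration of the two conventions, using local stand-ins rather than the real BLAS symbols:)

    /* toy illustration of the two calling conventions for a complex dot product;
       zdotu_value and zdotu_ptr are local stand-ins, not the real BLAS symbols */
    #include <complex.h>
    #include <stdio.h>

    /* convention 1: the complex result is returned by value */
    static double _Complex zdotu_value(int n, const double _Complex *x,
                                       const double _Complex *y) {
        double _Complex s = 0;
        for (int i = 0; i < n; i++) s += x[i] * y[i];
        return s;
    }

    /* convention 2: the caller passes a pointer that receives the result */
    static void zdotu_ptr(double _Complex *result, int n,
                          const double _Complex *x, const double _Complex *y) {
        double _Complex s = 0;
        for (int i = 0; i < n; i++) s += x[i] * y[i];
        *result = s;
    }

    int main(void) {
        double _Complex x[2] = {1.0 + 2.0*I, 3.0 + 4.0*I};
        double _Complex y[2] = {5.0 + 6.0*I, 7.0 + 8.0*I};
        double _Complex a = zdotu_value(2, x, y), b;
        zdotu_ptr(&b, 2, x, y);
        printf("%g%+gi vs %g%+gi\n", creal(a), cimag(a), creal(b), cimag(b));
        return 0;   /* both print the same value; only the ABI differs */
    }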
+zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +sgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + + +cgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ismax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ismin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o 
$(@F) $^ + +idmin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +snrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +scnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +dznrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smallscaling: smallscaling.c ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread + +clean :: + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling + +include $(TOPDIR)/Makefile.tail diff --git a/benchmark/amax.c b/benchmark/amax.c index 29310dd71..446ba4c07 100644 --- a/benchmark/amax.c +++ b/benchmark/amax.c @@ -1,133 +1,133 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef AMAX - -#ifdef COMPLEX -#ifdef DOUBLE -#define AMAX BLASFUNC(dzamax) -#else -#define AMAX BLASFUNC(scamax) -#endif -#else -#ifdef DOUBLE -#define AMAX BLASFUNC(damax) -#else -#define AMAX BLASFUNC(samax) -#endif -#endif - -int main(int argc, char *argv[]) -{ - - FLOAT *x; - blasint m, i; - blasint inc_x = 1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1, timeg; - - argc--; - argv++; - - if (argc > 0) - { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) - { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) - { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) - { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) - { - - timeg = 0; - fprintf(stderr, " %6d : ", (int)m); - - for (l = 0; l < loops; l++) - { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) - { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - begin(); - AMAX(&m, x, &inc_x); - end(); - timeg += getsec(); - } - - timeg /= loops; - - fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef AMAX + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMAX BLASFUNC(dzamax) +#else +#define AMAX BLASFUNC(scamax) +#endif +#else +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x; + blasint m, i; + blasint inc_x = 1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) + { + + timeg = 0; + fprintf(stderr, " %6d : ", (int)m); + + for (l = 0; l < loops; l++) + { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + begin(); + AMAX(&m, x, &inc_x); + end(); + timeg += getsec(); + } + + timeg /= loops; + + fprintf(stderr, + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/amin.c b/benchmark/amin.c index 54a1d266a..44f15a7f8 100644 --- a/benchmark/amin.c +++ b/benchmark/amin.c @@ -1,137 +1,137 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef AMIN - -#ifdef COMPLEX -#ifdef DOUBLE -#define AMIN BLASFUNC(dzamin) -#else -#define AMIN BLASFUNC(scamin) -#endif -#else -#ifdef DOUBLE -#define AMIN BLASFUNC(damin) -#else -#define AMIN BLASFUNC(samin) -#endif -#endif - -int main(int argc, char *argv[]) -{ - - FLOAT *x; - blasint m, i; - blasint inc_x = 1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1, timeg; - - argc--; - argv++; - - if (argc > 0) - { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) - { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) - { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) - { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) - { - - timeg = 0; - - fprintf(stderr, " %6d : ", (int)m); - - for (l = 0; l < loops; l++) - { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) - { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - begin(); - - AMIN(&m, x, &inc_x); - - end(); - - timeg += getsec(); - } - - timeg /= loops; - - fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef AMIN + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMIN BLASFUNC(dzamin) +#else +#define AMIN BLASFUNC(scamin) +#endif +#else +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x; + blasint m, i; + blasint inc_x = 1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) + { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + + for (l = 0; l < loops; l++) + { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + begin(); + + AMIN(&m, x, &inc_x); + + end(); + + timeg += getsec(); + } + + timeg /= loops; + + fprintf(stderr, + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/bench.h b/benchmark/bench.h index c03d72bef..1dae4d0fd 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){ #endif +/* Benchmarks should allocate with cacheline (often 64 bytes) alignment + to avoid unreliable results. This technique, storing the allocated + pointer value just before the aligned memory, doesn't require + C11's aligned_alloc for compatibility with older compilers. */ +static void *aligned_alloc_cacheline(size_t n) +{ + void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1); + if (p) { + void **newp = (void **) + (((uintptr_t)p + L1_DATA_LINESIZE) & (uintptr_t)-L1_DATA_LINESIZE); + newp[-1] = p; + p = newp; + } + return p; +} +#define malloc aligned_alloc_cacheline +#define free(p) free((p) ? ((void **)(p))[-1] : (p)) + #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; #elif defined(__APPLE__) diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c index 35249bdf9..7bf047abd 100644 --- a/benchmark/hbmv.c +++ b/benchmark/hbmv.c @@ -1,134 +1,134 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef HBMV - -#ifdef DOUBLE -#define HBMV BLASFUNC(zhbmv) -#else -#define HBMV BLASFUNC(chbmv) -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {0.0, 0.0}; - blasint k = 1; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1, inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - if ((p = getenv("OPENBLAS_K"))) k = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", - from, to, step, uplo, k, inc_x, inc_y, loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m, (int)m); - - for(j = 0; j < m; j++) { - for(i = 0; i < m * COMPSIZE; i++) { - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l = 0; l < loops; l++) { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - begin(); - - HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - - end(); - - timeg += getsec(); - - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. 
* (double)(2 * k + 1) * (double)m / timeg * 1.e-6); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "bench.h" + +#undef HBMV + +#ifdef DOUBLE +#define HBMV BLASFUNC(zhbmv) +#else +#define HBMV BLASFUNC(chbmv) +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {0.0, 0.0}; + blasint k = 1; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_K"))) k = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, uplo, k, inc_x, inc_y, loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * 
COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + begin(); + + HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); + + end(); + + timeg += getsec(); + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index 907e2adc4..0dc296ccc 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -1,133 +1,133 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
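The benchmark/bench.h hunk above redefines malloc/free with aligned_alloc_cacheline: it over-allocates, rounds the returned pointer up to the next L1_DATA_LINESIZE boundary, and stashes the original malloc() result in the slot just below the aligned block so the free() wrapper can recover it. The sketch below reproduces the same technique in a self-contained form; it uses a fixed 64-byte line size and its own function names rather than the patch's L1_DATA_LINESIZE macro, so treat it as an approximation of the idea, not the patched header.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define LINESIZE 64  /* stand-in for L1_DATA_LINESIZE */

/* Over-allocate, round up past one pointer slot to the next cache-line
 * boundary, and store the original malloc() result just below the block. */
static void *aligned_alloc_line(size_t n)
{
    void *p = malloc(n + sizeof(void *) + LINESIZE - 1);
    if (p == NULL)
        return NULL;
    void **aligned = (void **)
        (((uintptr_t)p + sizeof(void *) + LINESIZE - 1) & ~(uintptr_t)(LINESIZE - 1));
    aligned[-1] = p;                  /* what free() must eventually see */
    return aligned;
}

static void aligned_free_line(void *q)
{
    if (q != NULL)
        free(((void **)q)[-1]);
}

int main(void)
{
    double *x = aligned_alloc_line(1000 * sizeof(double));
    printf("aligned to %d bytes: %s\n", LINESIZE,
           ((uintptr_t)x % LINESIZE) == 0 ? "yes" : "no");
    aligned_free_line(x);
    return 0;
}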
-*****************************************************************************/ - -#include "bench.h" - -#undef HPMV - -#ifdef DOUBLE -#define HPMV BLASFUNC(zhpmv) -#else -#define HPMV BLASFUNC(chpmv) -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1, inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m, (int)m); - - for(j = 0; j < m; j++) { - for(i = 0; i < m * COMPSIZE; i++) { - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l = 0; l < loops; l++) { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - begin(); - - HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - - end(); - - time1 = getsec(); - - timeg += time1; - - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "bench.h" + +#undef HPMV + +#ifdef DOUBLE +#define HPMV BLASFUNC(zhpmv) +#else +#define HPMV BLASFUNC(chpmv) +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + begin(); + + HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); + + end(); + + time1 = getsec(); + + timeg += time1; + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/iamin.c b/benchmark/iamin.c index a57638ecc..2384641a5 100644 --- a/benchmark/iamin.c +++ b/benchmark/iamin.c @@ -1,120 +1,120 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef IAMIN - -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMIN BLASFUNC(izamin) -#else -#define IAMIN BLASFUNC(icamin) -#endif -#else -#ifdef DOUBLE -#define IAMIN BLASFUNC(idamin) -#else -#define IAMIN BLASFUNC(isamin) -#endif -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = 
(FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + 
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) - inc_y = atoi(p); - - fprintf( - stderr, - "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", - from, to, step, inc_x, inc_y, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == - NULL) { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - - if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == - NULL) { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) { - - timeg = 0; - - fprintf(stderr, " %6d : ", (int)m); - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - for (l = 0; l < loops; l++) { - begin(); - - ROTM(&m, x, &inc_x, y, &inc_y, param); - - end(); - - time1 = getsec(); - - timeg += time1; - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops %10.6f sec\n", - COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "bench.h" + +#undef ROTM + +#ifdef DOUBLE +#define ROTM BLASFUNC(drotm) +#else +#define ROTM BLASFUNC(srotm) +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x, *y; + // FLOAT result; + blasint m, i; + blasint inc_x = 1, inc_y = 1; + FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) + inc_y = atoi(p); + + fprintf( + stderr, + "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, inc_x, inc_y, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + + if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + begin(); + + ROTM(&m, x, &inc_x, y, &inc_y, param); + + end(); + + time1 = getsec(); + + timeg += time1; + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} diff --git a/benchmark/scal.c b/benchmark/scal.c index 8de6cfd04..79bcb6729 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
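The new benchmark/rotm.c above times srotm/drotm, which applies a 2x2 modified-Givens matrix H (selected by param[0]) to the vector pair (x, y); in the full-matrix case that is 4 multiplies and 2 additions per element pair, which is where the 6.0 * m factor in its MFlops formula comes from. Below is a reference-style sketch of the flag = -1 ("full H") case with unit increments; it is illustrative only and not the OpenBLAS kernel.

#include <stddef.h>

/* Reference sketch of DROTM, flag = -1 case only, inc_x = inc_y = 1:
 *   [x_i]   [h11 h12] [x_i]
 *   [y_i] = [h21 h22] [y_i]
 * i.e. 4 multiplies + 2 adds = 6 flops per element pair.
 * BLAS stores the matrix as param[1]=h11, param[2]=h21, param[3]=h12, param[4]=h22. */
static void drotm_full(size_t n, double *x, double *y, const double param[5])
{
    const double h11 = param[1], h21 = param[2];
    const double h12 = param[3], h22 = param[4];

    for (size_t i = 0; i < n; i++) {
        double xi = x[i], yi = y[i];
        x[i] = h11 * xi + h12 * yi;
        y[i] = h21 * xi + h22 * yi;
    }
}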
int main(int argc, char *argv[]){ - FLOAT *x, *y; + FLOAT *x; FLOAT alpha[2] = { 2.0, 2.0 }; blasint m, i; blasint inc_x=1,inc_y=1; @@ -74,10 +74,6 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - #ifdef __linux srandom(getpid()); #endif @@ -91,30 +87,20 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6d : ", (int)m); + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + begin(); for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m,(int)m); - - for(j = 0; j < m; j++){ - for(i = 0; i < m * COMPSIZE; i++){ - a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_QNX) #include #include #endif @@ -107,7 +107,7 @@ extern "C" { #endif #endif -#ifdef OS_HAIKU +#if defined(OS_HAIKU) || defined(OS_QNX) #define NO_SYSV_IPC #endif @@ -387,6 +387,10 @@ typedef int blasint; #endif */ +#ifdef __EMSCRIPTEN__ +#define YIELDING +#endif + #ifndef YIELDING #define YIELDING 
sched_yield() #endif diff --git a/common_param.h b/common_param.h index 31fba9059..e14ef2782 100644 --- a/common_param.h +++ b/common_param.h @@ -50,6 +50,7 @@ typedef struct { #ifdef BUILD_BFLOAT16 int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; + int sbgemm_align_k; void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); diff --git a/cpuid_x86.c b/cpuid_x86.c index 4ac1de047..4afa931f0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1544,6 +1544,17 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 11: //family 6 exmodel 11 + switch (model) { + case 7: // Raptor Lake + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; } break; case 0x7: @@ -2334,6 +2345,18 @@ int get_coretype(void){ return CORE_NEHALEM; } + case 11: + switch (model) { + case 7: // Raptor Lake +#ifndef NO_AVX2 + if(support_avx2()) + return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } case 15: if (model <= 0x2) return CORE_NORTHWOOD; else return CORE_PRESCOTT; diff --git a/ctest.c b/ctest.c index df628b1d4..2ccae8dcc 100644 --- a/ctest.c +++ b/ctest.c @@ -173,3 +173,8 @@ HAVE_C11 ARCH_E2K #endif +#if defined(__EMSCRIPTEN__) +ARCH_RISCV64 +OS_WINDOWS +#endif + diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index e779fb168..91338b73b 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -40,7 +40,7 @@ else() c_${float_char}blas1.c) endif() target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat1 m) endif() add_test(NAME "x${float_char}cblat1" @@ -65,7 +65,7 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat2 m) endif() add_test(NAME "x${float_char}cblat2" @@ -90,7 +90,7 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() add_test(NAME "x${float_char}cblat3" diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c index 4993d31bb..57e4707a9 100644 --- a/ctest/c_sblat1c.c +++ b/ctest/c_sblat1c.c @@ -969,7 +969,7 @@ real *sfac; 1.17 }; /* Local variables */ - extern /* Subroutine */ srottest_(); + extern /* Subroutine */ void srottest_(); static integer i__, k, ksize; extern /* Subroutine */ int stest_(), srotmtest_(); static integer ki, kn; diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 4a8e193be..b7328876b 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -304,6 +304,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; } + BLASLONG pad_min_l = min_l; +#if defined(HALF) 
+#if defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); +#else + pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; +#endif +#endif + /* First, we have to move data A to L2 cache */ min_i = m_to - m_from; l1stride = 1; @@ -350,7 +359,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, - sb + min_l * (jjs - js) * COMPSIZE * l1stride); + sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(outercost); @@ -358,10 +367,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) KERNEL_OPERATION(min_i, min_jj, min_l, alpha, - sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); + sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #else KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, - sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); + sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #endif STOP_RPCC(kernelcost); diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index dfc7107b8..02b60b50d 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -324,6 +324,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } + + BLASLONG pad_min_l = min_l; + +#if defined(HALF) +#if defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); +#else + pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; +#endif +#endif /* Determine step size in m * Note: We are currently on the first step in m @@ -382,13 +392,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Copy part of local region of B into workspace */ START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, - buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride); + buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(copy_B); /* Apply kernel with local region of A and part of local region of B */ START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, - sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride, + sa, buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); STOP_RPCC(kernel); diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 9cfd825ec..051513f27 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -470,9 +470,13 @@ blas_queue_t *tscq; #endif #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif +#endif #ifdef MONITOR main_status[cpu] = MAIN_RUNNING1; @@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ queue -> position = pos; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); #endif +#endif #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c 
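The driver/level3/level3.c and level3_thread.c hunks above pad the packed-B stride for BF16 (HALF) builds: min_l is rounded up to pad_min_l with the usual power-of-two round-up, (x + a - 1) & ~(a - 1), so that each copied panel of B starts at a multiple of the kernel's alignment requirement (sbgemm_align_k / SBGEMM_ALIGN_K). The snippet below is a self-contained illustration of that round-up idiom only; the alignment value of 4 is an arbitrary example, not the value any particular SBGEMM kernel uses.

#include <stdio.h>

/* Round x up to the next multiple of align; align must be a power of two. */
static long round_up_pow2(long x, long align)
{
    return (x + align - 1) & ~(align - 1);
}

int main(void)
{
    long min_l = 1021, align_k = 4;   /* example values only */
    printf("min_l = %ld -> pad_min_l = %ld\n",
           min_l, round_up_pow2(min_l, align_k));
    return 0;
}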
index 1a5fd06a3..e06ab8404 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -69,6 +69,8 @@ int blas_server_avail = 0; +extern int openblas_omp_adaptive_env(); + static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; #ifdef HAVE_C11 static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; @@ -282,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ sb = queue -> sb; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif #endif if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { @@ -381,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ #ifdef CONSISTENT_FPCSR for (i = 0; i < num; i ++) { +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); +#endif } #endif diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 33b58f134..afa33cccc 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -278,12 +278,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_SINGLE sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } @@ -295,11 +298,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_COMPLEX sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 9a693b06f..f61930983 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -1018,7 +1018,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 24; i++) + for ( i=1 ; i <= 25; i++) { if (!strncasecmp(coretype,corename[i],20)) { diff --git a/driver/others/init.c b/driver/others/init.c index cc3145a62..cd10e8d36 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -823,6 +823,8 @@ void gotoblas_affinity_init(void) { if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS"); + if (numprocs == 0) numprocs = readenv_atoi("OPENBLAS_DEFAULT_NUM_THREADS"); + numnodes = 1; if (numprocs == 1) { diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c index ef91a08e6..35b2270d4 100644 --- a/driver/others/openblas_env.c +++ b/driver/others/openblas_env.c @@ -67,10 +67,16 @@ void openblas_read_env() { openblas_env_thread_timeout=(unsigned int)ret; ret=0; - if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if (readenv(p,"OPENBLAS_DEFAULT_NUM_THREADS")) ret = atoi(p); if(ret<0) 
ret=0; openblas_env_openblas_num_threads=ret; + ret=0; + if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + if(ret != 0 || openblas_env_openblas_num_threads == 0) + openblas_env_openblas_num_threads=ret; + ret=0; if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; diff --git a/f_check b/f_check index bb13e1640..02e078b8a 100755 --- a/f_check +++ b/f_check @@ -82,10 +82,6 @@ else vendor=FUJITSU openmp='-Kopenmp' ;; - *Cray*) - vendor=CRAY - openmp='-fopenmp' - ;; *GNU*|*GCC*) v="${data#*GCC: *\) }" @@ -117,6 +113,10 @@ else esac fi ;; + *Cray*) + vendor=CRAY + openmp='-fopenmp' + ;; *g95*) vendor=G95 openmp='' diff --git a/f_check.pl b/f_check.pl index cfc7331c2..f093b9ad5 100644 --- a/f_check.pl +++ b/f_check.pl @@ -76,11 +76,6 @@ if ($compiler eq "") { $vendor = FUJITSU; $openmp = "-Kopenmp"; - } elsif ($data =~ /Cray/) { - - $vendor = CRAY; - $openmp = "-fopenmp"; - } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ s/\(+.*?\)+//g; @@ -106,6 +101,10 @@ if ($compiler eq "") { $openmp = ""; } } + } elsif ($data =~ /Cray/) { + + $vendor = CRAY; + $openmp = "-fopenmp"; } diff --git a/getarch.c b/getarch.c index cde5b4e83..f26ca6325 100644 --- a/getarch.c +++ b/getarch.c @@ -1410,7 +1410,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ - "-march=armv8.4-a -mtune=neoverse-v1" + "-march=armv8.4-a+sve -mtune=neoverse-v1" #define LIBNAME "neoversev1" #define CORENAME "NEOVERSEV1" #endif diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 0b2998237..4e082928b 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES # these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c - trsm.c syrk.c syr2k.c + trsm.c syrk.c syr2k.c gemmt.c ) set(BLAS3_MANGLED_SOURCES @@ -189,7 +189,16 @@ if (NOT DEFINED NO_LAPACK) ) GenerateNamedObjects("${LAPACK_SOURCES}") + if (NOT RELAPACK_REPLACE) GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) + else () + GenerateNamedObjects("lapack/getrs.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/getf2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/potf2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/laswp.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) + endif() endif () if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) diff --git a/interface/Makefile b/interface/Makefile index abdac96e1..6f320d8f7 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -44,12 +44,12 @@ SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ - sgeadd.$(SUFFIX) + sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) SBBLAS2OBJS = sbgemv.$(SUFFIX) -SBBLAS3OBJS = sbgemm.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -76,7 +76,7 @@ DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ - dgeadd.$(SUFFIX) + dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) 
cswap.$(SUFFIX) \ @@ -105,7 +105,7 @@ CBLAS3OBJS = \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ - cgeadd.$(SUFFIX) + cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ @@ -134,7 +134,7 @@ ZBLAS3OBJS = \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ - zgeadd.$(SUFFIX) + zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -281,12 +281,12 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ - cblas_sgeadd.$(SUFFIX) + cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) -CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -306,7 +306,7 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ - cblas_dgeadd.$(SUFFIX) + cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -331,7 +331,7 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) CXERBLAOBJ = \ cblas_xerbla.$(SUFFIX) @@ -362,7 +362,7 @@ CZBLAS3OBJS = \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ - cblas_zgeadd.$(SUFFIX) + cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -1300,6 +1300,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h @@ -1320,6 +1322,24 @@ zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgemmt.$(SUFFIX) xgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1907,6 +1927,23 @@ cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c 
$(CFLAGS) $< -o $(@F) +cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + +cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/gemmt.c b/interface/gemmt.c new file mode 100644 index 000000000..3eed1dfe4 --- /dev/null +++ b/interface/gemmt.c @@ -0,0 +1,589 @@ +/*********************************************************************/ +/* Copyright 2022, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#define SMP_THRESHOLD_MIN 65536.0 +#ifdef XDOUBLE +#define ERROR_NAME "QGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMT " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMT " +#else +#define ERROR_NAME "SGEMT " +#endif +#else +#define SMP_THRESHOLD_MIN 8192.0 +#ifdef XDOUBLE +#define ERROR_NAME "XGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMT " +#else +#define ERROR_NAME "CGEMT " +#endif +#endif + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * N, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, n, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; +#if defined(COMPLEX) + FLOAT alpha_r, alpha_i, beta_r, beta_i; +#else + FLOAT alpha, beta; +#endif + + PRINT_DEBUG_NAME; + + m = *M; + n = *N; + k = *K; + +#if defined(COMPLEX) + FLOAT *alpha = Alpha; + alpha_r = *(Alpha + 0); + alpha_i = *(Alpha + 1); + + beta_r = *(Beta + 0); + beta_i = *(Beta + 1); +#else + alpha = *Alpha; + beta = *Beta; +#endif + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; +#ifndef COMPLEX + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; +#else + if (transA == 'R') + transa = 2; + if (transA == 'C') + transa = 3; +#endif + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; +#ifndef COMPLEX + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; +#else + if (transB == 'R') + transb = 2; + if (transB == 'C') + transb = 3; +#endif + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + + info = 0; + + if (uplo < 0) + info = 14; + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + if (info) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, + blasint N, blasint k, +#ifndef COMPLEX + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ +#else + void *valpha, + void *va, blasint LDA, + void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc) +{ + FLOAT *alpha = (FLOAT *) valpha; + FLOAT *beta = (FLOAT *) vbeta; + FLOAT *A = (FLOAT *) va; + FLOAT *B = (FLOAT *) vb; + FLOAT *c = (FLOAT *) vc; +#endif + FLOAT *aa, *bb, *cc; + + int transa, transb, uplo; + blasint info; + blasint m, n, lda, ldb; + FLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; +#else + if (TransA == CblasConjNoTrans) + transa = 2; + 
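/*
 * For reference: ?gemmt computes alpha*op(A)*op(B) + beta*C, but only the
 * triangle of C selected by UPLO is referenced and updated. A rough sketch,
 * assuming the usual convention (column-major, C n-by-n, no transpose,
 * upper triangle):
 *
 *   for (j = 0; j < n; j++)
 *     for (i = 0; i <= j; i++) {
 *       FLOAT t = 0.0;
 *       for (l = 0; l < k; l++)
 *         t += a[i + l * lda] * b[l + j * ldb];
 *       c[i + j * ldc] = alpha * t + beta * c[i + j * ldc];
 *     }
 *
 * The transpose arguments are folded into small integers that later index
 * the gemv[] / gemv_thread[] tables: 0 = no transpose, 1 = transpose and,
 * for the complex build only, 2 = conjugate ('R') and 3 = conjugate
 * transpose ('C'); for real data 'R'/'C' collapse onto 0/1, since
 * conjugation is a no-op there.
 */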
if (TransA == CblasConjTrans) + transa = 3; +#endif + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; +#else + if (TransB == CblasConjNoTrans) + transb = 2; + if (TransB == CblasConjTrans) + transb = 3; +#endif + + m = M; + n = N; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + } + + if (order == CblasRowMajor) { + m = N; + n = M; + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; +#else + if (TransB == CblasConjNoTrans) + transa = 2; + if (TransB == CblasConjTrans) + transa = 3; +#endif + if (TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; +#else + if (TransA == CblasConjNoTrans) + transb = 2; + if (TransA == CblasConjTrans) + transb = 3; +#endif + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + } + + uplo = -1; + if (Uplo == CblasUpper) + uplo = 0; + if (Uplo == CblasLower) + uplo = 1; + if (uplo < 0) + info = 14; + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#if defined(COMPLEX) + FLOAT alpha_r = *(alpha + 0); + FLOAT alpha_i = *(alpha + 1); + + FLOAT beta_r = *(beta + 0); + FLOAT beta_i = *(beta + 1); +#endif + +#endif + int buffer_size; + blasint l; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + +#if defined(COMPLEX) + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, + xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, + xgemv_thread_d, +#elif defined DOUBLE + zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, + zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, + zgemv_thread_d, +#else + cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, + cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, + cgemv_thread_d, +#endif + }; +#endif + + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, + FLOAT *) = { + GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,}; + +#else + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qgemv_thread_n, qgemv_thread_t, +#elif defined DOUBLE + dgemv_thread_n, dgemv_thread_t, +#else + sgemv_thread_n, sgemv_thread_t, +#endif + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T,}; + +#endif + + if ((m == 0) || (n == 0)) + return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + const blasint incb = (transb == 0) ? 
1 : ldb; + + if (uplo == 1) { + for (i = 0; i < n; i++) { + j = n - i; + + l = j; +#if defined(COMPLEX) + aa = a + i * 2; + bb = b + i * ldb * 2; + if (transa) { + l = k; + aa = a + lda * i * 2; + bb = b + i * 2; + } + cc = c + i * 2 * ldc + i * 2; +#else + aa = a + i; + bb = b + i * ldb; + if (transa) { + l = k; + aa = a + lda * i; + bb = b + i; + } + cc = c + i * ldc + i; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); +#endif +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < n; i++) { + j = i + 1; + + l = j; +#if defined COMPLEX + bb = b + i * ldb * 2; + if (transa) { + l = k; + bb = b + i * 2; + } + cc = c + i * 2 * ldc; +#else + bb = b + i * ldb; + if (transa) { + l = k; + bb = b + i; + } + cc = c + i * ldc; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); +#endif + +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + args.m * args.k + args.k * args.n + + args.m * args.n, 2 * args.m * args.n * args.k); + + IDEBUG_END; + + return; +} diff --git a/kernel/Makefile b/kernel/Makefile index cbe4cde6e..977886044 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,7 +23,7 @@ ifeq ($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) - AVX2OPT = -mavx2 + AVX2OPT = -mavx2 -mfma endif endif ifdef NO_AVX2 @@ -73,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override 
CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), ZEN) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else ifeq ($(TARGET_CORE), LOONGSON3R4) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index ea010db42..9a5938459 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index 07a94a043..b743d1a43 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S @@ -190,10 +190,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMM_BETA = sbgemm_beta_neoversen2.c SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c -SBGEMMINCOPY = sbgemm_ncopy_neoversen2.c -SBGEMMITCOPY = sbgemm_tcopy_neoversen2.c -SBGEMMONCOPY = sbgemm_ncopy_neoversen2.c -SBGEMMOTCOPY = sbgemm_tcopy_neoversen2.c +SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c +SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index ea010db42..9a5938459 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index a20d0d4a6..41cedc851 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index a20d0d4a6..41cedc851 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff 
--git a/kernel/arm64/dot.c b/kernel/arm64/dot.c new file mode 100644 index 000000000..4607ebc59 --- /dev/null +++ b/kernel/arm64/dot.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2022, Arm Ltd +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +// Some compilers will report feature support for SVE without the appropriate +// header available +#ifdef HAVE_SVE +#if defined __has_include +#if __has_include() && __ARM_FEATURE_SVE +#define USE_SVE +#endif +#endif +#endif + +#ifdef USE_SVE +#include "dot_kernel_sve.c" +#endif +#include "dot_kernel_asimd.c" + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + RETURN_TYPE dot = 0.0 ; + + if ( n <= 0 ) return dot; + +#ifdef USE_SVE + if (inc_x == 1 && inc_y == 1) { + return dot_kernel_sve(n, x, y); + } +#endif + + return dot_kernel_asimd(n, x, inc_x, y, inc_y); +} + +#if defined(SMP) +static int dot_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); + + return 0; +} +#endif + +RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + RETURN_TYPE dot = 0.0; + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + dot = dot_compute(n, x, inc_x, y, inc_y); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + RETURN_TYPE *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; 
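/*
 * The mode flag tells the generic level-1 threading helper which element
 * type it is splitting work for; each worker stores its partial sum into
 * its own slot of the result buffer (slots are sizeof(double) * 2 bytes
 * apart), and the partial sums are accumulated into dot below.
 */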
+#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)dot_thread_function, nthreads); + + ptr = (RETURN_TYPE *)result; + for (i = 0; i < nthreads; i++) { + dot = dot + (*ptr); + ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + dot = dot_compute(n, x, inc_x, y, inc_y); +#endif + + return dot; +} diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_kernel_asimd.c similarity index 53% rename from kernel/arm64/dot_thunderx2t99.c rename to kernel/arm64/dot_kernel_asimd.c index 3940acddd..1288838f8 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_kernel_asimd.c @@ -1,5 +1,6 @@ /*************************************************************************** Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,25 +37,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RETURN_TYPE double #endif -#define N "x0" /* vector length */ -#define X "x1" /* "X" vector address */ -#define INC_X "x2" /* "X" stride */ -#define Y "x3" /* "Y" vector address */ -#define INC_Y "x4" /* "Y" stride */ -#define J "x5" /* loop variable */ - #if !defined(DOUBLE) #if !defined(DSDOT) +#define DOT_MOD "s" #define REG0 "wzr" -#define DOTF "s0" #define TMPX "s16" #define TMPY "s24" #define INC_SHIFT "2" #define N_DIV_SHIFT "6" #define N_REM_MASK "63" #else +#define DOT_MOD "d" #define REG0 "xzr" -#define DOTF "d0" #define TMPX "s16" #define TMPX1 "d2" #define TMPY "s24" @@ -64,8 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N_REM_MASK "15" #endif #else +#define DOT_MOD "d" #define REG0 "xzr" -#define DOTF "d0" #define TMPX "d16" #define TMPY "d24" #define INC_SHIFT "3" @@ -73,59 +67,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define N_REM_MASK "31" #endif +#define OUT "%"DOT_MOD"[DOT_]" + #if !defined(DOUBLE) #if !defined(DSDOT) #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ - " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ + " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" #define KERNEL_F \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.4s, v20.4s, v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.4s, v20.4s, v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" @@ -142,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else /* !defined(DSDOT) */ #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ " fcvt "TMPX1", "TMPX" \n" \ " fcvt "TMPY1", "TMPY" \n" \ " fmul "TMPX1", "TMPX1", "TMPY1" \n" \ - " fadd "DOTF", "DOTF", "TMPX1" \n" + " fadd "OUT", "OUT", "TMPX1" \n" #define KERNEL_F \ - " ldp q18, q19, ["X"] \n" \ - " ldp q26, q27, ["Y"] \n" \ + " ldp q18, q19, [%[X_]] \n" \ + " ldp q26, q27, [%[Y_]] \n" \ " fcvtl v16.2d, v18.2s \n" \ " fcvtl2 v17.2d, v18.4s \n" \ " fcvtl v18.2d, v19.2s \n" \ @@ -163,8 +159,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " fcvtl2 v25.2d, v26.4s \n" \ " fcvtl v26.2d, v27.2s \n" \ " fcvtl2 v27.2d, v27.4s \n" \ - " ldp q22, q23, ["X", #32] \n" \ - " ldp q30, q31, ["Y", #32] \n" \ + " ldp q22, q23, [%[X_], #32] \n" \ + " ldp q30, q31, [%[Y_], #32] \n" \ " fcvtl v20.2d, v22.2s \n" \ " fcvtl2 v21.2d, v22.4s \n" \ " fcvtl v22.2d, v23.2s \n" \ @@ -173,16 +169,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " fcvtl2 v29.2d, v30.4s \n" \ " fcvtl v30.2d, v31.2s \n" \ " fcvtl2 v31.2d, v31.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " add "Y", "Y", #64 \n" \ - " add "X", "X", #64 \n" \ + " add %[Y_], %[Y_], #64 \n" \ + " add %[X_], %[X_], #64 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ @@ -196,60 +192,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ - " faddp "DOTF", v0.2d \n" + " faddp "OUT", v0.2d \n" #endif /* !defined(DSDOT) */ #else /* !defined(DOUBLE) */ #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ - " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ + " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" #define KERNEL_F \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" @@ -261,28 +257,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ - " faddp "DOTF", v0.2d \n" + " faddp "OUT", v0.2d \n" #endif /* !defined(DOUBLE) */ -#if defined(SMP) -extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, - BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads); -#endif - -static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - RETURN_TYPE dot = 0.0 ; - - if ( n < 0 ) return dot; + RETURN_TYPE dot = 0.0; + BLASLONG j = 0; __asm__ __volatile__ ( - " mov "N", %[N_] \n" - " mov "X", %[X_] \n" - " mov "INC_X", %[INCX_] \n" - " mov "Y", %[Y_] \n" - " mov "INC_Y", %[INCY_] \n" - " fmov "DOTF", "REG0" \n" + " fmov "OUT", "REG0" \n" " fmov d1, xzr \n" " fmov d2, xzr \n" " fmov d3, xzr \n" @@ -290,42 +274,40 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " fmov d5, xzr \n" " fmov d6, xzr \n" " fmov d7, xzr \n" - " cmp "N", xzr \n" - " ble 9f //dot_kernel_L999 \n" - " cmp "INC_X", #1 \n" + " cmp %[INCX_], #1 \n" " bne 5f //dot_kernel_S_BEGIN \n" - " cmp "INC_Y", #1 \n" + " cmp %[INCY_], #1 \n" " bne 5f //dot_kernel_S_BEGIN \n" "1: //dot_kernel_F_BEGIN: \n" - " lsl "INC_X", "INC_X", "INC_SHIFT" \n" - " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" - " asr "J", "N", #"N_DIV_SHIFT" \n" - " cmp "J", xzr \n" + " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" + " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" + " asr %[J_], %[N_], #"N_DIV_SHIFT" \n" + " cmp %[J_], xzr \n" " beq 3f //dot_kernel_F1 \n" " .align 5 \n" "2: //dot_kernel_F: \n" " "KERNEL_F" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 2b //dot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" "3: //dot_kernel_F1: \n" - " ands "J", "N", #"N_REM_MASK" \n" + " ands %[J_], %[N_], #"N_REM_MASK" \n" " ble 9f //dot_kernel_L999 \n" "4: //dot_kernel_F10: \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 4b //dot_kernel_F10 \n" " b 9f //dot_kernel_L999 \n" "5: //dot_kernel_S_BEGIN: \n" - " lsl "INC_X", "INC_X", "INC_SHIFT" \n" - " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" - " asr "J", "N", #2 \n" - " cmp "J", xzr \n" + " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" + " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" + " asr %[J_], %[N_], #2 \n" + " cmp %[J_], xzr \n" " ble 7f //dot_kernel_S1 \n" "6: //dot_kernel_S4: \n" @@ -333,88 +315,31 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 6b //dot_kernel_S4 \n" "7: //dot_kernel_S1: \n" - " ands "J", "N", #3 \n" + " ands %[J_], %[N_], #3 \n" " ble 9f //dot_kernel_L999 \n" "8: //dot_kernel_S10: \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 8b //dot_kernel_S10 \n" "9: //dot_kernel_L999: \n" - " str "DOTF", [%[DOT_]] \n" - : - : [DOT_] "r" (&dot), //%0 - [N_] "r" (n), //%1 - [X_] "r" (x), //%2 - [INCX_] "r" (inc_x), //%3 - [Y_] "r" (y), //%4 - [INCY_] "r" (inc_y) //%5 + : [DOT_] "=&w" (dot) + : [N_] "r" (n), + [X_] "r" (x), + [INCX_] "r" (inc_x), + [Y_] "r" (y), + [INCY_] "r" (inc_y), + [J_] "r" (j) : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return dot; } - -#if defined(SMP) -static 
int dot_thread_function(BLASLONG n, BLASLONG dummy0, - BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) -{ - *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); - - return 0; -} -#endif - -RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ -#if defined(SMP) - int nthreads; - FLOAT dummy_alpha; -#endif - RETURN_TYPE dot = 0.0; - -#if defined(SMP) - if (inc_x == 0 || inc_y == 0 || n <= 10000) - nthreads = 1; - else - nthreads = num_cpu_avail(1); - - if (nthreads == 1) { - dot = dot_compute(n, x, inc_x, y, inc_y); - } else { - int mode, i; - char result[MAX_CPU_NUMBER * sizeof(double) * 2]; - RETURN_TYPE *ptr; - -#if !defined(DOUBLE) - mode = BLAS_SINGLE | BLAS_REAL; -#else - mode = BLAS_DOUBLE | BLAS_REAL; -#endif - - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, - x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); - - ptr = (RETURN_TYPE *)result; - for (i = 0; i < nthreads; i++) { - dot = dot + (*ptr); - ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); - } - } -#else - dot = dot_compute(n, x, inc_x, y, inc_y); -#endif - - return dot; -} diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c new file mode 100644 index 000000000..8460e0d5e --- /dev/null +++ b/kernel/arm64/dot_kernel_sve.c @@ -0,0 +1,66 @@ +/*************************************************************************** +Copyright (c) 2022, Arm Ltd +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#include + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { + SVE_TYPE acc_a = SVE_ZERO; + SVE_TYPE acc_b = SVE_ZERO; + + BLASLONG sve_width = SVE_WIDTH; + + for (BLASLONG i = 0; i < n; i += sve_width * 2) { + svbool_t pg_a = SVE_WHILELT(i, n); + svbool_t pg_b = SVE_WHILELT(i + sve_width, n); + + SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); + SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); + SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); + SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + + acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); + acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); + } + + return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); +} diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c index 66e7dd38a..4c1385fbe 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c @@ -37,9 +37,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { - if (alpha == 1.0f) - return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); - else - return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); - return 0; + if (alpha == 1.0f) + return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); + else + return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); + return 0; } diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c index 7d53b1aa0..26ea7ee61 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c @@ -30,636 +30,442 @@ #include "common.h" +#define INIT_C(M, N) mc##M##N = svdup_f32(0); + +#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + +#define INIT_C_8x4 \ + do { \ + INIT_C(0, 0); \ + INIT_C(0, 1); \ + INIT_C(1, 0); \ + INIT_C(1, 1); \ + INIT_C(2, 0); \ + INIT_C(2, 1); \ + INIT_C(3, 0); \ + INIT_C(3, 1); \ + } while (0); + #ifdef ALPHA_ONE -#define LOAD_C(M, N) \ - mc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svadd_z((PG), SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#else +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svmad_z((PG), svalpha, SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#endif -#define LOAD_C_LOW(M, N) \ - mc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc); +#ifdef ALPHA_ONE +int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#else +int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG pad_k = (k + 3) & ~3; + + svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; + svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, + vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, + oc0, oc1, oc2, oc3, oc4, oc5, oc6, 
oc7; + svfloat32_t svalpha = svdup_f32(alpha); + + svbool_t pg16 = svptrue_b16(); + svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbool_t pg32 = svptrue_b32(); + svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); + svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); + + bfloat16_t *ptr_a = (bfloat16_t *)A; + bfloat16_t *ptr_b = (bfloat16_t *)B; + FLOAT *ptr_c = C; + + bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; + bfloat16_t *ptr_b0, *ptr_b1; + FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + + for (BLASLONG j = 0; j < n / 4; j++) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c2 = ptr_c1 + ldc; + ptr_c3 = ptr_c2 + ldc; + ptr_c += 4 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C_8x4; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + MATMUL(2, 0); MATMUL(2, 1); + MATMUL(3, 0); MATMUL(3, 1); + + ptr_a0 += 32; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + vc4 = svuzp1(mc01, mc11); + vc5 = svuzp1(mc21, mc31); + vc6 = svuzp2(mc01, mc11); + vc7 = svuzp2(mc21, mc31); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0+4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1+4, oc3, vc3); + UPDATE_C(pg32, ptr_c2, oc4, vc4) + UPDATE_C(pg32, ptr_c2+4, oc5, vc5); + UPDATE_C(pg32, ptr_c3, oc6, vc6) + UPDATE_C(pg32, ptr_c3+4, oc7, vc7); + + ptr_c0 += 8; + ptr_c1 += 8; + ptr_c2 += 8; + ptr_c3 += 8; + } -#define LOAD_C_EVEN(M, N) \ - mc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc); + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + INIT_C(1, 0); INIT_C(1, 1); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + + ptr_a0 += 16; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + vc2 = svuzp1(mc01, mc11); + vc3 = svuzp2(mc01, mc11); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + UPDATE_C(pg32, ptr_c2, oc2, vc2); + UPDATE_C(pg32, ptr_c3, oc3, vc3); + + ptr_c0 += 4; + ptr_c1 += 4; + ptr_c2 += 4; + ptr_c3 += 4; + } -#define LOAD_C_FIRST(M, N) \ - mc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc); + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 8; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + vc2 = svuzp1(mc01, mc01); + vc3 = svuzp2(mc01, mc01); + + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + UPDATE_C(pg32_low, ptr_c2, oc2, vc2); + UPDATE_C(pg32_low, ptr_c3, oc3, vc3); + + ptr_c0 += 2; + ptr_c1 += 2; + ptr_c2 += 2; + ptr_c3 += 2; + } -#define STORE_C(M, N) \ - svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); + if (m & 1) { + ptr_a0 = 
ptr_a; + ptr_b0 = ptr_b; -#define STORE_C_LOW(M, N) \ - svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N); + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); -#define STORE_C_EVEN(M, N) \ - svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); + MATMUL(0, 0); MATMUL(0, 1); -#define STORE_C_FIRST(M, N) \ - svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); + ptr_a0 += 4; + ptr_b0 += 16; + } -#else -#define LOAD_C(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); + vc1 = svuzp2(mc00, mc00); + vc3 = svuzp2(mc01, mc01); -#define LOAD_C_LOW(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M , off_vc); + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + UPDATE_C(pg32_first, ptr_c2, oc2, mc01); + UPDATE_C(pg32_first, ptr_c3, oc3, vc3); -#define LOAD_C_EVEN(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M , off_vc); + } -#define LOAD_C_FIRST(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M , off_vc); + ptr_b += 4 * pad_k; + } -#define STORE_C(M, N) \ - mc##M##N = svmad_z(pg32, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); + if (n & 2) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c += 2 * ldc; + ptr_a = (bfloat16_t *)A; -#define STORE_C_LOW(M, N) \ - mc##M##N = svmad_z(pg32_low, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N); + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; -#define STORE_C_EVEN(M, N) \ - mc##M##N = svmad_z(pg32_even, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); + ptr_b0 = ptr_b; -#define STORE_C_FIRST(M, N) \ - mc##M##N = svmad_z(pg32_first, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 0); + INIT_C(3, 0); -#endif + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); -#define LOAD_A(M) ma##M = svld1_bf16(pg16, ptr_a##M); + mb0 = svld1_bf16(pg16, ptr_b0); -#define LOAD_B(N) mb##N = svld1_bf16(pg16, ptr_b##N); + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); -#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + ptr_a0 += 32; + ptr_b0 += 8; + } -#define LOAD_KREST_1(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, \ - *(ptr_##NAME##M + 1), zero, zero, zero); + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); -#define LOAD_KREST_1_LOW(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, zero, zero, \ - zero, zero); + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1 + 4, oc3, vc3); -#define LOAD_KREST_2(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, zero, \ - *(ptr_##NAME##M + 2), *(ptr_##NAME##M + 3), zero, zero); + ptr_c0 += 8; + ptr_c1 += 8; + } -#define LOAD_KREST_2_LOW(NAME, M) 
\ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, \ - zero, zero, zero, zero, zero); + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; -#define LOAD_KREST_3(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \ - *(ptr_##NAME##M + 2), zero, *(ptr_##NAME##M + 3), \ - *(ptr_##NAME##M + 4), *(ptr_##NAME##M + 5), zero); + INIT_C(0, 0); + INIT_C(1, 0); -#define LOAD_KREST_3_LOW(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \ - *(ptr_##NAME##M + 2), zero, zero, zero, zero, zero); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 8; + } + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + + ptr_c0 += 4; + ptr_c1 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + + ptr_c0 += 2; + ptr_c1 += 2; -#ifdef ALPHA_ONE -int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#else -int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#endif -{ - bfloat16_t *ptr_a = (bfloat16_t *)A; - bfloat16_t *ptr_b = (bfloat16_t *)B; - FLOAT *ptr_c = C; - - bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; - bfloat16_t *ptr_b0, *ptr_b1; - FLOAT *ptr_c00, *ptr_c01; - - svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; - svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31; -#ifndef ALPHA_ONE - svfloat32_t oc00, oc01, oc10, oc11, oc20, oc21, oc30, oc31; -#endif - svbool_t pg16 = svptrue_b16(); - svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); - svbool_t pg32 = svptrue_b32(); - svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); - svbool_t pg32_even = svdupq_b32(1, 0, 1, 0); - svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); - svfloat32_t svalpha = svdup_f32(alpha); - bfloat16 tmp = 0; - bfloat16_t zero = *((bfloat16_t *)&tmp); - BLASLONG krest = k & 3; - - // 00 01 10 11 - svuint32_t off_vc = svdupq_u32(0, (uint32_t)ldc, 1, (uint32_t)ldc + 1); - - for (BLASLONG j = 0; j < n / 4; j++) { - ptr_c00 = ptr_c; - ptr_c01 = ptr_c + 2 * ldc; - ptr_c += 4 * ldc; - - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - LOAD_C(2, 0); LOAD_C(2, 1); - LOAD_C(3, 0); LOAD_C(3, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1(b, 0); 
LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - } - - STORE_C(0, 0); STORE_C(0, 1); - STORE_C(1, 0); STORE_C(1, 1); - STORE_C(2, 0); STORE_C(2, 1); - STORE_C(3, 0); STORE_C(3, 1); - - ptr_c00 += 8; ptr_c01 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - } - - STORE_C(0, 0); STORE_C(0, 1); - STORE_C(1, 0); STORE_C(1, 1); - - ptr_c00 += 4; ptr_c01 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - } - STORE_C(0, 0); STORE_C(0, 1); - ptr_c00 += 2; ptr_c01 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C_LOW(0, 0); LOAD_C_LOW(0, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 4; - ptr_b0 += 8; - ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - } - STORE_C_LOW(0, 0); STORE_C_LOW(0, 1); - } - - ptr_b += 4 * k; } - if (n & 2) { - ptr_c00 = ptr_c; - ptr_c += 2 * ldc; - - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - LOAD_C(2, 0); - LOAD_C(3, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - LOAD_B(0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); 
LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - } - - STORE_C(0, 0); - STORE_C(1, 0); - STORE_C(2, 0); - STORE_C(3, 0); - - ptr_c00 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - LOAD_B(0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - } - STORE_C(0, 0) - STORE_C(1, 0) - - ptr_c00 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - LOAD_B(0); - MATMUL(0, 0); - ptr_a0 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - } - STORE_C(0, 0); - ptr_c00 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - LOAD_B(0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - } - STORE_C_LOW(0, 0); - } - - ptr_b += 2 * k; + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 8; + } + vc1 = svuzp2(mc00, mc00); + + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + } + + ptr_b += 2 * pad_k; + } + + if (n & 1) { + ptr_c0 = ptr_c; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 0); + INIT_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 4; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + + ptr_c0 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + INIT_C(0, 0); + INIT_C(1, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_low, 
ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc10); + UPDATE_C(pg32, ptr_c0, oc0, vc0); + ptr_c0 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + ptr_c0 += 2; } - if (n & 1) { - ptr_c00 = ptr_c; - ptr_a = (bfloat16_t *) A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - LOAD_C_EVEN(1, 0); - LOAD_C_EVEN(2, 0); - LOAD_C_EVEN(3, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - } - STORE_C_EVEN(0, 0) - STORE_C_EVEN(1, 0); - STORE_C_EVEN(2, 0); - STORE_C_EVEN(3, 0); - - ptr_c00 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - LOAD_C_EVEN(1, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - } - STORE_C_EVEN(0, 0) - STORE_C_EVEN(1, 0) - - ptr_c00 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - } - STORE_C_EVEN(0, 0); - ptr_c00 += 2; - } - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - LOAD_C_FIRST(0, 0); - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 4; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - } - STORE_C_FIRST(0, 0); - } + if (m & 1) { + ptr_a0 = ptr_a; + 
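/*
 * Final single-row, single-column tail: both the A and B panels are loaded
 * with the low-half predicate (pg16_low), and pg32_first keeps only the one
 * valid result lane when C is updated.
 */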
ptr_b0 = ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 4; + } + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); } + } - return 0; -} \ No newline at end of file + return 0; +} diff --git a/kernel/arm64/sbgemm_ncopy_4_neoversen2.c b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c new file mode 100644 index 000000000..22978a388 --- /dev/null +++ b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c @@ -0,0 +1,126 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include <arm_sve.h> + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset; + IFLOAT *a_offsetx[4]; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbfloat16_t v0, v1, v2, v3; + + for (BLASLONG j = 0; j < n / 4; j++) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offsetx[2] = a_offsetx[1] + lda; + a_offsetx[3] = a_offsetx[2] + lda; + a_offset += 4 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + + b_offset += 16; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + a_offsetx[2] += 4; + a_offsetx[3] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 4; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 
0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offset += 2 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + + b_offset += 8; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 2; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offsetx[0] = a_offset; + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + b_offset += 4; + a_offsetx[0] += 4; + } + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = a_offsetx[0][0]; + b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; + b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; + b_offset[3] = rest <= 3 ? 0 : a_offsetx[0][3]; + } + } + + return 0; +} diff --git a/kernel/arm64/sbgemm_ncopy_neoversen2.c b/kernel/arm64/sbgemm_ncopy_neoversen2.c deleted file mode 100644 index 594067ebb..000000000 --- a/kernel/arm64/sbgemm_ncopy_neoversen2.c +++ /dev/null @@ -1,101 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * *****************************************************************************/ - -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset; - - a_offset = a; - b_offset = b; - - for (BLASLONG j = 0; j < n / 2; j++) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset1 + 2); - *(b_offset + 3) = *(a_offset1 + 3); - *(b_offset + 4) = *(a_offset2 + 0); - *(b_offset + 5) = *(a_offset2 + 1); - *(b_offset + 6) = *(a_offset2 + 2); - *(b_offset + 7) = *(a_offset2 + 3); - - a_offset1 += 4; - a_offset2 += 4; - b_offset += 8; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset1 + 2); - *(b_offset + 3) = *(a_offset2 + 0); - *(b_offset + 4) = *(a_offset2 + 1); - *(b_offset + 5) = *(a_offset2 + 2); - b_offset += 6; - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset2 + 0); - *(b_offset + 3) = *(a_offset2 + 1); - b_offset += 4; - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - b_offset += 2; - } - } - if (n & 1) { - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - *(b_offset + 2) = *(a_offset + 2); - *(b_offset + 3) = *(a_offset + 3); - - b_offset += 4; - a_offset += 4; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - *(b_offset + 2) = *(a_offset + 2); - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset + 0); - } - } - - return 0; -} diff --git a/kernel/arm64/sbgemm_tcopy_8_neoversen2.c b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c new file mode 100644 index 000000000..a058b5a8e --- /dev/null +++ b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c @@ -0,0 +1,165 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + for (BLASLONG j = 0; j < n / 8; j++) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 8; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 8; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + + b_offset += 32; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 8; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 32; + } + } + + if (n & 4) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 4; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 4; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + + b_offset += 16; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 4; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 2; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + b_offset += 8; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 
0 : a_offset3[line]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + b_offset[0] = *a_offset0; + b_offset[1] = *a_offset1; + b_offset[2] = *a_offset2; + b_offset[3] = *a_offset3; + b_offset += 4; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = *a_offset0; + b_offset[1] = rest == 1 ? 0 : *a_offset1; + b_offset[2] = rest <= 2 ? 0 : *a_offset2; + b_offset[3] = rest <= 3 ? 0 : *a_offset3; + } + } + return 0; +} diff --git a/kernel/arm64/sbgemm_tcopy_neoversen2.c b/kernel/arm64/sbgemm_tcopy_neoversen2.c deleted file mode 100644 index 2f3313379..000000000 --- a/kernel/arm64/sbgemm_tcopy_neoversen2.c +++ /dev/null @@ -1,109 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * *****************************************************************************/ - -#include "common.h" - - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - IFLOAT *b_offset; - a_offset = a; - b_offset = b; - - for (BLASLONG j = 0; j < n / 2; j++) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 2; - - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset3 + 0); - *(b_offset + 3) = *(a_offset4 + 0); - *(b_offset + 4) = *(a_offset1 + 1); - *(b_offset + 5) = *(a_offset2 + 1); - *(b_offset + 6) = *(a_offset3 + 1); - *(b_offset + 7) = *(a_offset4 + 1); - - b_offset += 8; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; - a_offset4 += 4 * lda; - } - - if (m & 3) { - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset3 + 0); - *(b_offset + 3) = *(a_offset1 + 1); - *(b_offset + 4) = *(a_offset2 + 1); - *(b_offset + 5) = *(a_offset3 + 1); - b_offset += 6; - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset1 + 1); - *(b_offset + 3) = *(a_offset2 + 1); - b_offset += 4; - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - b_offset += 2; - } - } - } - if (n & 1) { - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - *(b_offset + 2) = *(a_offset + lda * 2); - *(b_offset + 3) = *(a_offset + lda * 3); - - b_offset += 4; - a_offset += 4 * lda; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - *(b_offset + 2) = *(a_offset + lda * 2); - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset); - } - } - - return 0; -} diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S index 30450cc7d..c819ee6fb 100644 --- a/kernel/arm64/sgemm_ncopy_4.S +++ b/kernel/arm64/sgemm_ncopy_4.S @@ -1,333 +1,333 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A00 x2 -#define LDA x3 -#define B00 x4 - -#define A01 x5 -#define A02 x6 -#define A03 x7 -#define A04 x8 - -#define I x9 -#define J x10 - -#define TEMP1 x11 -#define TEMP2 x12 - -#define A_PREFETCH 2560 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -.macro COPY4x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - ldr q2, [A03], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - - ldr q3, [A04], #16 - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] - add B00, B00, #64 - -.endm - -.macro COPY1x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - ldr s2, [A03], #4 - ldr s3, [A04], #4 - - stp s0, s1, [B00] - add B00, B00, #8 - stp s2, s3, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] - add B00, B00, #32 -.endm - - -.macro COPY1x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - - stp s0, s1, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x1 - prfm PLDL1KEEP, [A01, 
#A_PREFETCH] - - ldr q0, [A01], #16 - str q0, [B00], #16 -.endm - - -.macro COPY1x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01], #4 - str s0, [B00], #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - -.Ldgemm_ncopy_L4_BEGIN: - - asr J, N, #2 // J = N / 4 - cmp J, #0 - ble .Ldgemm_ncopy_L2_BEGIN - - .align 5 -.Ldgemm_ncopy_L4_M4_BEGIN: - - mov A01, A00 - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A00, A04, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_40 - - .align 5 -.Ldgemm_ncopy_L4_M4_20: - - COPY4x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_20 - -.Ldgemm_ncopy_L4_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_END - - .align 5 -.Ldgemm_ncopy_L4_M4_60: - - COPY1x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_60 - -.Ldgemm_ncopy_L4_M4_END: - - subs J , J, #1 // j-- - bne .Ldgemm_ncopy_L4_M4_BEGIN - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L2_BEGIN: - - tst N, #3 - ble .Ldgemm_ncopy_L999 - - tst N, #2 - ble .Ldgemm_ncopy_L1_BEGIN - -.Ldgemm_ncopy_L2_M4_BEGIN: - mov A01, A00 - add A02, A01, LDA - add A00, A02, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_40 - - .align 5 -.Ldgemm_ncopy_L2_M4_20: - - COPY4x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_20 - -.Ldgemm_ncopy_L2_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_END - - .align 5 -.Ldgemm_ncopy_L2_M4_60: - - COPY1x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_60 - -.Ldgemm_ncopy_L2_M4_END: - - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L1_BEGIN: - - tst N, #1 - ble .Ldgemm_ncopy_L999 - -.Ldgemm_ncopy_L1_M4_BEGIN: - - mov A01, A00 - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_40 - - .align 5 -.Ldgemm_ncopy_L1_M4_20: - - COPY4x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_20 - - -.Ldgemm_ncopy_L1_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_END - - .align 5 -.Ldgemm_ncopy_L1_M4_60: - - COPY1x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_60 - - -.Ldgemm_ncopy_L1_M4_END: - -.Ldgemm_ncopy_L999: - - mov x0, #0 - RESTORE_REGS - ret - - EPILOGUE - +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 + +#define I x9 +#define J x10 + +#define TEMP1 x11 +#define TEMP2 x12 + +#define A_PREFETCH 2560 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + + ldr q3, [A04], #16 + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] + add B00, B00, #64 + +.endm + +.macro COPY1x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ldr s2, [A03], #4 + ldr s3, [A04], #4 + + stp s0, s1, [B00] + add B00, B00, #8 + stp s2, s3, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] + add 
B00, B00, #32 +.endm + + +.macro COPY1x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + + stp s0, s1, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01], #16 + str q0, [B00], #16 +.endm + + +.macro COPY1x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Ldgemm_ncopy_L4_BEGIN: + + asr J, N, #2 // J = N / 4 + cmp J, #0 + ble .Ldgemm_ncopy_L2_BEGIN + + .align 5 +.Ldgemm_ncopy_L4_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_40 + + .align 5 +.Ldgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_20 + +.Ldgemm_ncopy_L4_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_END + + .align 5 +.Ldgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_60 + +.Ldgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne .Ldgemm_ncopy_L4_M4_BEGIN + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble .Ldgemm_ncopy_L999 + + tst N, #2 + ble .Ldgemm_ncopy_L1_BEGIN + +.Ldgemm_ncopy_L2_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_40 + + .align 5 +.Ldgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_20 + +.Ldgemm_ncopy_L2_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_END + + .align 5 +.Ldgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_60 + +.Ldgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Ldgemm_ncopy_L999 + +.Ldgemm_ncopy_L1_M4_BEGIN: + + mov A01, A00 + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_40 + + .align 5 +.Ldgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_20 + + +.Ldgemm_ncopy_L1_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_END + + .align 5 +.Ldgemm_ncopy_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_60 + + +.Ldgemm_ncopy_L1_M4_END: + +.Ldgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 431f1ae2a..3066421bb 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -1,814 +1,814 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A x2 -#define LDA x3 -#define B x4 - -#define M8 x5 - -#define A01 x6 -#define A02 x7 -#define A03 x8 -#define A04 x9 -#define A05 x10 -#define A06 x11 -#define A07 x12 -#define A08 x13 - -#define B01 x14 -#define B02 x15 -#define B03 x16 -#define B04 x17 -#define B00 x22 - - -#define I x21 -#define J x19 - -#define TEMP1 x20 - -#define A_PREFETCH 256 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - //prfm PSTL1KEEP, [B00, M8] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] - add A05, A05, #64 - - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] - add A06, A06, #64 - - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] - add A07, A07, #64 - - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] - add A08, A08, #64 - - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - add B00, B00, M8 - -.endm - -.macro COPY8x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, 
v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 - - ldp q8, q9, [A05] - ldp q10, q11, [A06] - add A05, A05, #32 - add A06, A06, #32 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] - add B01, B01, #64 - - ldp q12, q13, [A07] - ldp q14, q15, [A08] - add A07, A07, #32 - add A08, A08, #32 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - add B02, B02, #64 - - ldr q4, [A05] - ldr q5, [A06] - ldr q6, [A07] - ldr q7, [A08] - - add A05, A05, #16 - add A06, A06, #16 - add A07, A07, #16 - add A08, A08, #16 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] - add B02, B02, #64 -.endm - -.macro COPY2x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - add B03, B03, #16 - - ldr d4, [A05] - ldr d5, [A06] - ldr d6, [A07] - ldr d7, [A08] - - add A05, A05, #8 - add A06, A06, #8 - add A07, A07, #8 - add A08, A08, #8 - - stp d4, d5, [B03] - add B03, B03, #16 - stp d6, d7, [B03] - add B03, B03, #16 - -.endm - -.macro COPY1x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - - ldr s4, [A05] - ldr s5, [A06] - ldr s6, [A07] - ldr s7, [A08] - - stp s4, s5, [B04] - add B04, B04, #8 - stp s6, s7, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ -.macro COPY16x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - - add B00, B00, M8 -.endm - -.macro COPY8x4 - prfm PLDL1KEEP, [A01, 
#A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - - add B02, B02, #64 -.endm - -.macro COPY2x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - - add B03, B03, #16 -.endm - -.macro COPY1x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add B00, B00, M8 -.endm - -.macro COPY8x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s}, [A01] - ld1 {v2.4s, v3.4s}, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - add A01, A01, #16 - add A02, A02, #16 - - stp q0, q1, [B02] - add B02, B02, #32 -.endm - -.macro COPY2x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - - add A01, A01, #8 - add A02, A02, #8 - - stp d0, d1, [B03] - add B03, B03, #16 -.endm - -.macro COPY1x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - - add A01, A01, #4 - add A02, A02, #4 - - stp s0, s1, [B04] - - add B04, B04, #8 -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add B00, B00, M8 -.endm - -.macro COPY8x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldp q0, q1, [A01] - add A01, A01, #32 - stp q0, q1, [B01] - - add B01, B01, #32 -.endm - -.macro COPY4x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] 
- - ldr q0, [A01] - add A01, A01, #16 - str q0, [B02] - - add B02, B02, #16 -.endm - -.macro COPY2x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr d0, [A01] - add A01, A01, #8 - str d0, [B03] - - add B03, B03, #8 -.endm - -.macro COPY1x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01] - add A01, A01, #4 - str s0, [B04] - - add B04, B04, #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - - lsl TEMP1, M, #2 // TEMP1 = M * SIZE - - and B01 , N , #-16 - and B02 , N , #-8 - and B03 , N , #-4 - and B04 , N , #-2 - - mul B01, B01, TEMP1 - mul B02, B02, TEMP1 - mul B03, B03, TEMP1 - mul B04, B04, TEMP1 - - add B01 , B01, B - add B02 , B02, B - add B03 , B03, B - add B04 , B04, B - - lsl M8, M, #6 // M8 = M * 16 * SIZE - -.Lsgemm_tcopy_L8_BEGIN: - asr J, M, #3 // J = M / 8 - cmp J, #0 - ble .Lsgemm_tcopy_L4_BEGIN - - .align 5 -.Lsgemm_tcopy_L8_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A05, A04, LDA - add A06, A05, LDA - add A07, A06, LDA - add A08, A07, LDA - add A, A08, LDA - - mov B00, B - add B, B00, #512 // B = B + 8 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L8_M16_40 - - .align 5 -.Lsgemm_tcopy_L8_M16_20: - - COPY16x8 - - subs I , I , #1 - bne .Lsgemm_tcopy_L8_M16_20 - -.Lsgemm_tcopy_L8_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L8_M16_60 - - COPY8x8 - -.Lsgemm_tcopy_L8_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L8_M16_80 - - COPY4x8 - -.Lsgemm_tcopy_L8_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L8_M16_100 - - COPY2x8 - -.Lsgemm_tcopy_L8_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L8_M16_END - - COPY1x8 - -.Lsgemm_tcopy_L8_M16_END: - - subs J , J, #1 // j-- - bne .Lsgemm_tcopy_L8_M16_BEGIN - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L4_BEGIN: - tst M, #7 - ble .Lsgemm_tcopy_L999 - - tst M, #4 - ble .Lsgemm_tcopy_L2_BEGIN - -.Lsgemm_tcopy_L4_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A, A04, LDA - - mov B00, B - add B, B00, #256 // B = B + 4 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L4_M16_40 - - .align 5 -.Lsgemm_tcopy_L4_M16_20: - - COPY16x4 - - subs I , I , #1 - bne .Lsgemm_tcopy_L4_M16_20 - -.Lsgemm_tcopy_L4_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L4_M16_60 - - COPY8x4 - -.Lsgemm_tcopy_L4_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L4_M16_80 - - COPY4x4 - -.Lsgemm_tcopy_L4_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L4_M16_100 - - COPY2x4 - - -.Lsgemm_tcopy_L4_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L4_M16_END - - COPY1x4 - - -.Lsgemm_tcopy_L4_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L2_BEGIN: - - tst M, #3 - ble .Lsgemm_tcopy_L999 - - tst M, #2 - ble .Lsgemm_tcopy_L1_BEGIN - -.Lsgemm_tcopy_L2_M16_BEGIN: - mov A01, A - add A02, A01, LDA - add A, A02, LDA - - mov B00, B - add B, B00, #128 // B = B + 2 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L2_M16_40 - - .align 5 -.Lsgemm_tcopy_L2_M16_20: - - COPY16x2 - - subs I , I , #1 - bne .Lsgemm_tcopy_L2_M16_20 - -.Lsgemm_tcopy_L2_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L2_M16_60 - - COPY8x2 - -.Lsgemm_tcopy_L2_M16_60: - tst N , #4 - ble 
.Lsgemm_tcopy_L2_M16_80 - - COPY4x2 - -.Lsgemm_tcopy_L2_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L2_M16_100 - - COPY2x2 - -.Lsgemm_tcopy_L2_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L2_M16_END - - COPY1x2 - -.Lsgemm_tcopy_L2_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L1_BEGIN: - - tst M, #1 - ble .Lsgemm_tcopy_L999 - - -.Lsgemm_tcopy_L1_M16_BEGIN: - - mov A01, A // A01 = A - mov B00, B - - asr I, N, #4 // I = M / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L1_M16_40 - - .align 5 -.Lsgemm_tcopy_L1_M16_20: - - COPY16x1 - - subs I , I , #1 - bne .Lsgemm_tcopy_L1_M16_20 - -.Lsgemm_tcopy_L1_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L1_M16_60 - - COPY8x1 - -.Lsgemm_tcopy_L1_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L1_M16_80 - - COPY4x1 - -.Lsgemm_tcopy_L1_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L1_M16_100 - - COPY2x1 - -.Lsgemm_tcopy_L1_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L1_M16_END - - COPY1x1 - - -.Lsgemm_tcopy_L1_M16_END: - -.Lsgemm_tcopy_L999: - mov x0, #0 // set return value - RESTORE_REGS - ret - - EPILOGUE - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x21 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + //prfm PSTL1KEEP, [B00, M8] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + 
add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] + add A05, A05, #64 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] + add A06, A06, #64 + + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] + add A07, A07, #64 + + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] + add A08, A08, #64 + + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 + +.endm + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] + add B01, B01, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + add B02, B02, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] + add B02, B02, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + add B03, B03, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B03] + add B03, B03, #16 + stp d6, d7, [B03] + add B03, B03, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + 
//prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + stp s4, s5, [B04] + add B04, B04, #8 + stp s6, s7, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ +.macro COPY16x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + + add B00, B00, M8 +.endm + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + + add B02, B02, #64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + + add B03, B03, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add B00, B00, M8 +.endm + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add 
A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B02] + add B02, B02, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B03] + add B03, B03, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B04] + + add B04, B04, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B01] + + add B01, B01, #32 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B02] + + add B02, B02, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B03] + + add B03, B03, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B04] + + add B04, B04, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-16 + and B02 , N , #-8 + and B03 , N , #-4 + and B04 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + mul B04, B04, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + add B04 , B04, B + + lsl M8, M, #6 // M8 = M * 16 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #512 // B = B + 8 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L8_M16_40 + + .align 5 +.Lsgemm_tcopy_L8_M16_20: + + COPY16x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M16_20 + +.Lsgemm_tcopy_L8_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L8_M16_60 + + COPY8x8 + +.Lsgemm_tcopy_L8_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L8_M16_80 + + COPY4x8 + +.Lsgemm_tcopy_L8_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M16_100 + + COPY2x8 + +.Lsgemm_tcopy_L8_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M16_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M16_END: + + subs J , J, #1 // j-- + bne .Lsgemm_tcopy_L8_M16_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov 
B00, B + add B, B00, #256 // B = B + 4 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M16_40 + + .align 5 +.Lsgemm_tcopy_L4_M16_20: + + COPY16x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M16_20 + +.Lsgemm_tcopy_L4_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L4_M16_60 + + COPY8x4 + +.Lsgemm_tcopy_L4_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L4_M16_80 + + COPY4x4 + +.Lsgemm_tcopy_L4_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M16_100 + + COPY2x4 + + +.Lsgemm_tcopy_L4_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L4_M16_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #128 // B = B + 2 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M16_40 + + .align 5 +.Lsgemm_tcopy_L2_M16_20: + + COPY16x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M16_20 + +.Lsgemm_tcopy_L2_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L2_M16_60 + + COPY8x2 + +.Lsgemm_tcopy_L2_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L2_M16_80 + + COPY4x2 + +.Lsgemm_tcopy_L2_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M16_100 + + COPY2x2 + +.Lsgemm_tcopy_L2_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M16_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #4 // I = M / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M16_40 + + .align 5 +.Lsgemm_tcopy_L1_M16_20: + + COPY16x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M16_20 + +.Lsgemm_tcopy_L1_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L1_M16_60 + + COPY8x1 + +.Lsgemm_tcopy_L1_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L1_M16_80 + + COPY4x1 + +.Lsgemm_tcopy_L1_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M16_100 + + COPY2x1 + +.Lsgemm_tcopy_L1_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M16_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M16_END: + +.Lsgemm_tcopy_L999: + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE + + diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c index e02e10c61..8c250d401 100644 --- a/kernel/mips/sdot_msa.c +++ b/kernel/mips/sdot_msa.c @@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT x0, x1, x2, x3, y0, y1, y2, y3; v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; +#if defined(DSDOT) + v2f64 dvx0, dvx1, dvx2, dvx3, dvx4, dvx5, dvx6, dvx7; + v2f64 dvy0, dvy1, dvy2, dvy3, dvy4, dvy5, dvy6, dvy7; + v2f64 dot0 = {0, 0}; + v2f64 dot1 = {0, 0}; + v2f64 dot2 = {0, 0}; + v2f64 dot3 = {0, 0}; +#else v4f32 dot0 = {0, 0, 0, 0}; v4f32 dot1 = {0, 0, 0, 0}; v4f32 dot2 = {0, 0, 0, 0}; v4f32 dot3 = {0, 0, 0, 0}; +#endif if (n < 1) return (dot); @@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x_pref += 32; y_pref += 32; +#if defined(DSDOT) + /* Extend single precision to double precision */ + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + dvy4 = __msa_fexupr_d(vy4); + dvy5 = __msa_fexupr_d(vy5); + dvy6 = __msa_fexupr_d(vy6); + dvy7 = __msa_fexupr_d(vy7); + + 
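/* fexupr_d above widened one half of each source vector into dvy0..dvy7; fexupl_d below widens the other half in place (kept bit-cast as v4f32 and reinterpreted as v2f64 at the multiply), so every product is formed and accumulated in double precision */ +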
vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + vy4 = (v4f32)__msa_fexupl_d(vy4); + vy5 = (v4f32)__msa_fexupl_d(vy5); + vy6 = (v4f32)__msa_fexupl_d(vy6); + vy7 = (v4f32)__msa_fexupl_d(vy7); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + dvx4 = __msa_fexupr_d(vx4); + dvx5 = __msa_fexupr_d(vx5); + dvx6 = __msa_fexupr_d(vx6); + dvx7 = __msa_fexupr_d(vx7); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + vx4 = (v4f32)__msa_fexupl_d(vx4); + vx5 = (v4f32)__msa_fexupl_d(vx5); + vx6 = (v4f32)__msa_fexupl_d(vx6); + vx7 = (v4f32)__msa_fexupl_d(vx7); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += (dvy4 * dvx4); + dot1 += (dvy5 * dvx5); + dot2 += (dvy6 * dvx6); + dot3 += (dvy7 * dvx7); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); + dot0 += ((v2f64)vy4 * (v2f64)vx4); + dot1 += ((v2f64)vy5 * (v2f64)vx5); + dot2 += ((v2f64)vy6 * (v2f64)vx6); + dot3 += ((v2f64)vy7 * (v2f64)vx7); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); @@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot1 += (vy5 * vx5); dot2 += (vy6 * vx6); dot3 += (vy7 * vx7); +#endif } if (n & 31) @@ -100,10 +165,41 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); dot3 += (vy3 * vx3); +#endif } if (n & 8) @@ -111,8 +207,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP2_INC(x, 4, vx0, vx1); LD_SP2_INC(y, 4, vy0, vy1); +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); +#endif } if (n & 4) @@ -120,7 +235,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vx0 = LD_SP(x); x += 4; vy0 = LD_SP(y); y += 4; +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + vy0 = (v4f32)__msa_fexupl_d(vy0); + dvx0 = 
__msa_fexupr_d(vx0); + vx0 = (v4f32)__msa_fexupl_d(vx0); + dot0 += (dvy0 * dvx0); + dot0 += ((v2f64)vy0 * (v2f64)vx0); +#else dot0 += (vy0 * vx0); +#endif } if (n & 2) @@ -128,8 +252,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, 1, x0, x1); LD_GP2_INC(y, 1, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -137,7 +266,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } } @@ -145,8 +278,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += dot0[0]; dot += dot0[1]; +#if !defined(DSDOT) dot += dot0[2]; dot += dot0[3]; +#endif } else { @@ -155,10 +290,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP4_INC(x, inc_x, x0, x1, x2, x3); LD_GP4_INC(y, inc_y, y0, y1, y2, y3); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); + dot += ((double)y2 * (double)x2); + dot += ((double)y3 * (double)x3); +#else dot += (y0 * x0); dot += (y1 * x1); dot += (y2 * x2); dot += (y3 * x3); +#endif } if (n & 2) @@ -166,8 +308,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(y, inc_y, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -175,7 +322,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } } diff --git a/kernel/mips64/KERNEL.MIPS64_GENERIC b/kernel/mips64/KERNEL.MIPS64_GENERIC new file mode 100644 index 000000000..33bcbeedd --- /dev/null +++ b/kernel/mips64/KERNEL.MIPS64_GENERIC @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = 
../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S index 0ccc781e1..cd40414a2 100644 --- a/kernel/mips64/dnrm2.S +++ b/kernel/mips64/dnrm2.S @@ -90,7 +90,7 @@ //Init INF lui TEMP, 0x7FF0 dsll TEMP, TEMP, 32 - MTC1 TEMP, INF + MTC TEMP, INF LD a1, 0 * SIZE(X) daddiu 
N, N, -1 diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S index 4b5c2fa31..dfe17f3ef 100644 --- a/kernel/power/cgemm_kernel_power9.S +++ b/kernel/power/cgemm_kernel_power9.S @@ -1,293 +1,293 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - -#define alpha_r vs19 -#define alpha_i vs20 -#define save_permute_1 vs21 -#define permute_mask vs22 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define PRE r29 - -#define T12 r30 -#define T13 r31 - -#include "cgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_12, 0x0c0d0e0f1c1d1e1f -.equ save_permute_11, 0x0405060714151617 - - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - - - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) - - - -#ifdef TRMMKERNEL - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) -#endif - slwi LDC, LDC, ZBASE_SHIFT - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xscvdpspn alpha_i,vs2 - xxspltw alpha_r,alpha_r,0 - xxspltw alpha_i,alpha_i,0 -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - - - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - - - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - - - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - - - li r0,0 - li PRE,512 - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegsp alpha_r,alpha_r - xvnegsp alpha_i,alpha_i -#endif - - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - - /*mask is reverse permute so we have to make it inner permute */ - xxpermdi permute_mask, permute_mask, permute_mask,2 - -#include "cgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs19 +#define alpha_i vs20 +#define save_permute_1 vs21 +#define permute_mask vs22 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S index b4f937e90..a191219fa 100644 --- a/kernel/power/cgemm_logic_power9.S +++ b/kernel/power/cgemm_logic_power9.S @@ -1,2816 +1,2816 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define MY_ALIGN .align 3 -b CGEMM_L4 -/* MINI SUBROUTINES */ -/* 4x8 MAIN 128x+2 LOOP */ - - -CGEMM_L4x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x8_2 - MY_ALIGN -CGEMM_L4x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 -CGEMM_L4x8_K128: -/*----------------------------------------*/ - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_L2 128,64,31,0 - KERNEL4x8_L2 128,64,32,0 - KERNEL4x8_L2 128,64,33,0 - KERNEL4x8_L2 128,64,34,0 - KERNEL4x8_L2 128,64,35,0 - KERNEL4x8_L2 128,64,36,0 - KERNEL4x8_L2 128,64,37,0 - KERNEL4x8_L2 128,64,38,0 - KERNEL4x8_L2 128,64,39,0 - KERNEL4x8_L2 128,64,40,0 - KERNEL4x8_L2 128,64,41,0 - KERNEL4x8_L2 128,64,42,0 - KERNEL4x8_L2 128,64,43,0 - KERNEL4x8_L2 128,64,44,0 - KERNEL4x8_L2 128,64,45,0 - KERNEL4x8_L2 128,64,46,0 - KERNEL4x8_L2 128,64,47,0 - KERNEL4x8_L2 128,64,48,0 - KERNEL4x8_L2 128,64,49,0 - KERNEL4x8_L2 128,64,50,0 - KERNEL4x8_L2 128,64,51,0 - KERNEL4x8_L2 128,64,52,0 - KERNEL4x8_L2 128,64,53,0 - KERNEL4x8_L2 128,64,54,0 - KERNEL4x8_L2 128,64,55,0 - KERNEL4x8_L2 128,64,56,0 - KERNEL4x8_L2 128,64,57,0 - KERNEL4x8_L2 128,64,58,0 - KERNEL4x8_L2 128,64,59,0 - KERNEL4x8_L2 128,64,60,0 - KERNEL4x8_L2 128,64,61,0 - KERNEL4x8_L2 128,64,62,0 - KERNEL4x8_L2 128,64,63,1 - bdnz CGEMM_L4x8_LOOP - MY_ALIGN -CGEMM_L4x8_LOOP_END: -/*----------------------------------------*/ - END4x8_2 - blr - MY_ALIGN - - -CGEMM_4x8_L64_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - 
KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_E2 128,64,31,1 - blr - MY_ALIGN - - -CGEMM_4x8_L32_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_E2 128,64,15,1 - blr - MY_ALIGN - - -CGEMM_4x8_L16_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_E2 128,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x4_2 - MY_ALIGN -CGEMM_L4x4_LOOP: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,0,0 -CGEMM_L4x4_K32: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_L2 64,64,7,0 - KERNEL4x4_L2 64,64,8,0 - KERNEL4x4_L2 64,64,9,0 - KERNEL4x4_L2 64,64,10,0 - KERNEL4x4_L2 64,64,11,0 - KERNEL4x4_L2 64,64,12,0 - KERNEL4x4_L2 64,64,13,0 - KERNEL4x4_L2 64,64,14,0 - KERNEL4x4_L2 64,64,15,1 - bdnz CGEMM_L4x4_LOOP - MY_ALIGN -CGEMM_L4x4_LOOP_END: -/*----------------------------------------*/ - END4x4_2 - blr - MY_ALIGN - - -CGEMM_4x4_L16_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_E2 64,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_L8_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_E2 64,64,3,1 - blr - - -CGEMM_4x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x2_2 - MY_ALIGN -CGEMM_L4x2_LOOP: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,0,0 -CGEMM_L4x2_K32: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,1,0 - 
KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_L2 32,64,7,0 - KERNEL4x2_L2 32,64,8,0 - KERNEL4x2_L2 32,64,9,0 - KERNEL4x2_L2 32,64,10,0 - KERNEL4x2_L2 32,64,11,0 - KERNEL4x2_L2 32,64,12,0 - KERNEL4x2_L2 32,64,13,0 - KERNEL4x2_L2 32,64,14,0 - KERNEL4x2_L2 32,64,15,1 - bdnz CGEMM_L4x2_LOOP - MY_ALIGN - - -CGEMM_L4x2_LOOP_END: -/*----------------------------------------*/ - END4x2_2 - blr - MY_ALIGN -CGEMM_4x2_L16_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_E2 32,64,7,1 - blr - MY_ALIGN -CGEMM_4x2_L8_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_E2 32,64,3,1 - blr - - -CGEMM_4x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x1_2 - MY_ALIGN -CGEMM_L4x1_LOOP: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,0,0 -CGEMM_L4x1_K32: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_L2 16,64,7,0 - KERNEL4x1_L2 16,64,8,0 - KERNEL4x1_L2 16,64,9,0 - KERNEL4x1_L2 16,64,10,0 - KERNEL4x1_L2 16,64,11,0 - KERNEL4x1_L2 16,64,12,0 - KERNEL4x1_L2 16,64,13,0 - KERNEL4x1_L2 16,64,14,0 - KERNEL4x1_L2 16,64,15,1 - bdnz CGEMM_L4x1_LOOP - MY_ALIGN -CGEMM_L4x1_LOOP_END: -/*----------------------------------------*/ - END4x1_2 - blr - - MY_ALIGN -CGEMM_4x1_L16_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_E2 16,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x1_L8_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_E2 16,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L4: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 2 - ble CGEMM_L4_END - - -CGEMM_L4_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 2 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L4x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L4x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO4x8 - ble CGEMM_L4x8_SUB0 - bl CGEMM_L4x8_LMAIN_SUB - andi. 
L, T1, 127 - ble CGEMM_L4x8_SAVE - b CGEMM_L4x8_SUB2 - - -CGEMM_L4x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP4x8_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD4x8O 64,32 - END4x8_WITHOUT_ADD - LOAD4x8_2O 128, 64 - mtctr T8 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - CMP4x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L4x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD4x8_2O 128,64 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - MY_ALIGN - - -CGEMM_L4x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L4x8_SUB2_32 - bl CGEMM_4x8_L64_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L4x8_SUB2_16 - bl CGEMM_4x8_L32_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x8_SUB2_8 - bl CGEMM_4x8_L16_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x8_SUB2_4 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_L2 128,64, 1,0 - KERNEL4x8_L2 128,64, 2,0 - KERNEL4x8_E2 128,64, 3,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x8_SUB2_2 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_E2 128,64, 1,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x8_SUB2_1 - LOAD4x8_2 - KERNEL4x8_E2 128,64, 0,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x8_SAVE - KERNEL4x8 - - MY_ALIGN -CGEMM_L4x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 -#endif - bgt CGEMM_L4x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END - b CGEMM_L4x4_BEGIN - MY_ALIGN - - -CGEMM_L4x8_END: -/*----------------------------------------*/ - - -CGEMM_L4x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x4 - ble CGEMM_L4x4_SUB0 - bl CGEMM_4x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x4_SAVE - b CGEMM_L4x4_SUB2 - - -CGEMM_L4x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x4_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD4x4O 32,32 - END4x4_WITHOUT_ADD - LOAD4x4_2O 64, 64 - mtctr T8 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - CMP4x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD4x4_2O 64,64 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x4_SUB2: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble CGEMM_L4x4_SUB2_8 - bl CGEMM_4x4_L16_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x4_SUB2_4 - bl CGEMM_4x4_L8_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x4_SUB2_2 - LOAD4x4_2 - KERNEL4x4_L2 64,64, 0,0 - KERNEL4x4_E2 64,64, 1,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x4_SUB2_1 - LOAD4x4_2 - KERNEL4x4_E2 64,64, 0,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x4_SAVE - KERNEL4x4 - - -CGEMM_L4x4_SAVE: -/*----------------------------------------*/ - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 -#endif - - -CGEMM_L4x4_END: -/*----------------------------------------*/ - - -CGEMM_L4x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x2 - ble CGEMM_L4x2_SUB0 - bl CGEMM_4x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x2_SAVE - b CGEMM_L4x2_SUB2 - - -CGEMM_L4x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x2_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD4x2O 16,32 - END4x2_WITHOUT_ADD - LOAD4x2_2O 32, 64 - mtctr T8 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - CMP4x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD4x2_2O 32,64 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x2_SUB2_8 - bl CGEMM_4x2_L16_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x2_SUB2_4 - bl CGEMM_4x2_L8_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x2_SUB2_2 - LOAD4x2_2 - KERNEL4x2_L2 32,64, 0,0 - KERNEL4x2_E2 32,64, 1,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x2_SUB2_1 - LOAD4x2_2 - KERNEL4x2_E2 32,64, 0,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -CGEMM_L4x2_SAVE: -/*----------------------------------------*/ - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 -#endif - - -CGEMM_L4x2_END: -/*----------------------------------------*/ - - -CGEMM_L4x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x1 - ble CGEMM_L4x1_SUB0 - bl CGEMM_4x1_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L4x1_SAVE - b CGEMM_L4x1_SUB2 - - -CGEMM_L4x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x1_32K - addi BO,BO,-32 - addi AO,AO,-8 - LOAD4x1O 8,32 - END4x1_WITHOUT_ADD - LOAD4x1_2O 16, 64 - mtctr T8 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - CMP4x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-16 - LOAD4x1_2O 16,64 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x1_SUB2_8 - bl CGEMM_4x1_L16_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x1_SUB2_4 - bl CGEMM_4x1_L8_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x1_SUB2_2 - LOAD4x1_2 - KERNEL4x1_L2 16,64, 0,0 - KERNEL4x1_E2 16,64, 1,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x1_SUB2_1 - LOAD4x1_2 - KERNEL4x1_E2 16,64, 0,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -CGEMM_L4x1_SAVE: -/*----------------------------------------*/ - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 -#endif - - -CGEMM_L4x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - bgt CGEMM_L4_BEGIN - - -CGEMM_L4_END: - -b CGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -CGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -CGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 -CGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_L2 128,32,31,0 - KERNEL2x8_L2 128,32,32,0 - KERNEL2x8_L2 128,32,33,0 - KERNEL2x8_L2 128,32,34,0 - KERNEL2x8_L2 128,32,35,0 - KERNEL2x8_L2 128,32,36,0 - KERNEL2x8_L2 128,32,37,0 - KERNEL2x8_L2 128,32,38,0 - KERNEL2x8_L2 128,32,39,0 - KERNEL2x8_L2 128,32,40,0 - KERNEL2x8_L2 128,32,41,0 - KERNEL2x8_L2 128,32,42,0 - KERNEL2x8_L2 128,32,43,0 - KERNEL2x8_L2 128,32,44,0 - KERNEL2x8_L2 128,32,45,0 - KERNEL2x8_L2 128,32,46,0 - KERNEL2x8_L2 128,32,47,0 - 
KERNEL2x8_L2 128,32,48,0 - KERNEL2x8_L2 128,32,49,0 - KERNEL2x8_L2 128,32,50,0 - KERNEL2x8_L2 128,32,51,0 - KERNEL2x8_L2 128,32,52,0 - KERNEL2x8_L2 128,32,53,0 - KERNEL2x8_L2 128,32,54,0 - KERNEL2x8_L2 128,32,55,0 - KERNEL2x8_L2 128,32,56,0 - KERNEL2x8_L2 128,32,57,0 - KERNEL2x8_L2 128,32,58,0 - KERNEL2x8_L2 128,32,59,0 - KERNEL2x8_L2 128,32,60,0 - KERNEL2x8_L2 128,32,61,0 - KERNEL2x8_L2 128,32,62,0 - KERNEL2x8_L2 128,32,63,1 - bdnz CGEMM_L2x8_LOOP - MY_ALIGN -CGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -CGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_E2 128,32,31,1 - blr - MY_ALIGN - - -CGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_E2 128,32,15,1 - blr - MY_ALIGN - - -CGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_E2 128,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -CGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,0,0 -CGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_L2 64,32,7,0 - KERNEL2x4_L2 64,32,8,0 - KERNEL2x4_L2 64,32,9,0 - KERNEL2x4_L2 64,32,10,0 - KERNEL2x4_L2 64,32,11,0 - KERNEL2x4_L2 64,32,12,0 - KERNEL2x4_L2 64,32,13,0 - KERNEL2x4_L2 64,32,14,0 - KERNEL2x4_L2 64,32,15,1 - bdnz CGEMM_L2x4_LOOP - MY_ALIGN -CGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -CGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 
64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_E2 64,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_E2 64,32,3,1 - blr - - -CGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -CGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,0,0 -CGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_L2 32,32,7,0 - KERNEL2x2_L2 32,32,8,0 - KERNEL2x2_L2 32,32,9,0 - KERNEL2x2_L2 32,32,10,0 - KERNEL2x2_L2 32,32,11,0 - KERNEL2x2_L2 32,32,12,0 - KERNEL2x2_L2 32,32,13,0 - KERNEL2x2_L2 32,32,14,0 - KERNEL2x2_L2 32,32,15,1 - bdnz CGEMM_L2x2_LOOP - MY_ALIGN - - -CGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -CGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_E2 32,32,7,1 - blr - MY_ALIGN -CGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_E2 32,32,3,1 - blr - - -CGEMM_2x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -CGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,0,0 -CGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_L2 16,32,7,0 - KERNEL2x1_L2 16,32,8,0 - KERNEL2x1_L2 16,32,9,0 - KERNEL2x1_L2 16,32,10,0 - KERNEL2x1_L2 16,32,11,0 - KERNEL2x1_L2 16,32,12,0 - KERNEL2x1_L2 16,32,13,0 - KERNEL2x1_L2 16,32,14,0 - KERNEL2x1_L2 16,32,15,1 - bdnz CGEMM_L2x1_LOOP - MY_ALIGN -CGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -CGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_E2 16,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_E2 16,32,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L2: -/*----------------------------------------*/ - - andi. J, N, 2 - ble CGEMM_L2_END - - -CGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. 
I, M, 3 - ble CGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble CGEMM_L2x8_SUB0 - bl CGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L2x8_SAVE - b CGEMM_L2x8_SUB2 - - -CGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD2x8O 64,16 - END2x8_WITHOUT_ADD - LOAD2x8_2O 128, 32 - mtctr T8 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8_2O 128,32 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - MY_ALIGN - - -CGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L2x8_SUB2_32 - bl CGEMM_2x8_L64_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L2x8_SUB2_16 - bl CGEMM_2x8_L32_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x8_SUB2_8 - bl CGEMM_2x8_L16_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_L2 128,32, 1,0 - KERNEL2x8_L2 128,32, 2,0 - KERNEL2x8_E2 128,32, 3,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_E2 128,32, 1,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 128,32, 0,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -CGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt CGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END - b CGEMM_L2x4_BEGIN - MY_ALIGN - - -CGEMM_L2x8_END: -/*----------------------------------------*/ - - -CGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble CGEMM_L2x4_SUB0 - bl CGEMM_2x4_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L2x4_SAVE - b CGEMM_L2x4_SUB2 - - -CGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD2x4O 32,16 - END2x4_WITHOUT_ADD - LOAD2x4_2O 64, 32 - mtctr T8 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4_2O 64,32 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x4_SUB2_8 - bl CGEMM_2x4_L16_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x4_SUB2_4 - bl CGEMM_2x4_L8_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 64,32, 0,0 - KERNEL2x4_E2 64,32, 1,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 64,32, 0,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x4_SAVE - KERNEL2x4 - - -CGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -CGEMM_L2x4_END: -/*----------------------------------------*/ - - -CGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble CGEMM_L2x2_SUB0 - bl CGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x2_SAVE - b CGEMM_L2x2_SUB2 - - -CGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD2x2O 16,16 - END2x2_WITHOUT_ADD - LOAD2x2_2O 32, 32 - mtctr T8 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2_2O 32,32 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x2_SUB2_8 - bl CGEMM_2x2_L16_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x2_SUB2_4 - bl CGEMM_2x2_L8_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 32,32, 0,0 - KERNEL2x2_E2 32,32, 1,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 32,32, 0,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -CGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -CGEMM_L2x2_END: -/*----------------------------------------*/ - - -CGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble CGEMM_L2x1_SUB0 - bl CGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x1_SAVE - b CGEMM_L2x1_SUB2 - - -CGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-16 - addi AO,AO,-8 - LOAD2x1O 8,16 - END2x1_WITHOUT_ADD - LOAD2x1_2O 16, 32 - mtctr T8 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1_2O 16,32 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x1_SUB2_8 - bl CGEMM_2x1_L16_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x1_SUB2_4 - bl CGEMM_2x1_L8_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 16,32, 0,0 - KERNEL2x1_E2 16,32, 1,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 16,32, 0,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -CGEMM_L2x1_SAVE: -/*----------------------------------------*/ - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -CGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 4 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - -CGEMM_L2_END: - - -b CGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -CGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -CGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 -CGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_L2 128,16,31,0 - KERNEL1x8_L2 128,16,32,0 - KERNEL1x8_L2 128,16,33,0 - KERNEL1x8_L2 128,16,34,0 - KERNEL1x8_L2 128,16,35,0 - KERNEL1x8_L2 128,16,36,0 - KERNEL1x8_L2 128,16,37,0 - KERNEL1x8_L2 128,16,38,0 - KERNEL1x8_L2 128,16,39,0 - KERNEL1x8_L2 128,16,40,0 - KERNEL1x8_L2 128,16,41,0 - KERNEL1x8_L2 128,16,42,0 - KERNEL1x8_L2 128,16,43,0 - KERNEL1x8_L2 128,16,44,0 - KERNEL1x8_L2 128,16,45,0 - KERNEL1x8_L2 128,16,46,0 - KERNEL1x8_L2 128,16,47,0 - KERNEL1x8_L2 128,16,48,0 - KERNEL1x8_L2 128,16,49,0 - KERNEL1x8_L2 128,16,50,0 - KERNEL1x8_L2 128,16,51,0 - KERNEL1x8_L2 128,16,52,0 - KERNEL1x8_L2 128,16,53,0 - KERNEL1x8_L2 128,16,54,0 - KERNEL1x8_L2 128,16,55,0 - KERNEL1x8_L2 128,16,56,0 - KERNEL1x8_L2 128,16,57,0 - KERNEL1x8_L2 128,16,58,0 - KERNEL1x8_L2 128,16,59,0 - KERNEL1x8_L2 128,16,60,0 - KERNEL1x8_L2 128,16,61,0 - KERNEL1x8_L2 128,16,62,0 - KERNEL1x8_L2 128,16,63,1 - bdnz CGEMM_L1x8_LOOP - MY_ALIGN -CGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -CGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 
128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_E2 128,16,31,1 - blr - MY_ALIGN - - -CGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_E2 128,16,15,1 - blr - MY_ALIGN - - -CGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_E2 128,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN -CGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,0,0 -CGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_L2 64,16,7,0 - KERNEL1x4_L2 64,16,8,0 - KERNEL1x4_L2 64,16,9,0 - KERNEL1x4_L2 64,16,10,0 - KERNEL1x4_L2 64,16,11,0 - KERNEL1x4_L2 64,16,12,0 - KERNEL1x4_L2 64,16,13,0 - KERNEL1x4_L2 64,16,14,0 - KERNEL1x4_L2 64,16,15,1 - bdnz CGEMM_L1x4_LOOP - MY_ALIGN -CGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -CGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_E2 64,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_E2 64,16,3,1 - blr - - -CGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN -CGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,0,0 -CGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - KERNEL1x2_L2 32,16,7,0 - KERNEL1x2_L2 32,16,8,0 - KERNEL1x2_L2 32,16,9,0 - KERNEL1x2_L2 32,16,10,0 - KERNEL1x2_L2 32,16,11,0 - KERNEL1x2_L2 32,16,12,0 - KERNEL1x2_L2 32,16,13,0 - KERNEL1x2_L2 32,16,14,0 - KERNEL1x2_L2 32,16,15,1 - bdnz CGEMM_L1x2_LOOP - MY_ALIGN - - -CGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN -CGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - 
KERNEL1x2_E2 32,16,7,1 - blr - MY_ALIGN -CGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_E2 32,16,3,1 - blr - - -CGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN -CGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,0,0 -CGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_L2 16,16,7,0 - KERNEL1x1_L2 16,16,8,0 - KERNEL1x1_L2 16,16,9,0 - KERNEL1x1_L2 16,16,10,0 - KERNEL1x1_L2 16,16,11,0 - KERNEL1x1_L2 16,16,12,0 - KERNEL1x1_L2 16,16,13,0 - KERNEL1x1_L2 16,16,14,0 - KERNEL1x1_L2 16,16,15,1 - bdnz CGEMM_L1x1_LOOP - MY_ALIGN -CGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - - MY_ALIGN -CGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_E2 16,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_E2 16,16,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L1: -/*----------------------------------------*/ - - andi. J, N, 1 - ble CGEMM_L1_END - -CGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble CGEMM_L1x8_SUB0 - bl CGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L1x8_SAVE - b CGEMM_L1x8_SUB2 - - -CGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-8 - addi AO,AO,-64 - LOAD1x8O 64,8 - END1x8_WITHOUT_ADD - LOAD1x8_2O 128, 16 - mtctr T8 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8_2O 128,16 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - MY_ALIGN - - -CGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L1x8_SUB2_32 - bl CGEMM_1x8_L64_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. 
T1,L, 32 - ble CGEMM_L1x8_SUB2_16 - bl CGEMM_1x8_L32_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x8_SUB2_8 - bl CGEMM_1x8_L16_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_L2 128,16, 1,0 - KERNEL1x8_L2 128,16, 2,0 - KERNEL1x8_E2 128,16, 3,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_E2 128,16, 1,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 128,16, 0,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x8_SAVE - KERNEL1x8 - - MY_ALIGN -CGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt CGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END - b CGEMM_L1x4_BEGIN - MY_ALIGN - - -CGEMM_L1x8_END: -/*----------------------------------------*/ - - -CGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x4 - ble CGEMM_L1x4_SUB0 - bl CGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x4_SAVE - b CGEMM_L1x4_SUB2 - - -CGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-8 - addi AO,AO,-32 - LOAD1x4O 32,8 - END1x4_WITHOUT_ADD - LOAD1x4_2O 64, 16 - mtctr T8 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4_2O 64,16 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x4_SUB2_8 - bl CGEMM_1x4_L16_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x4_SUB2_4 - bl CGEMM_1x4_L8_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 64,16, 0,0 - KERNEL1x4_E2 64,16, 1,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 64,16, 0,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x4_SAVE - KERNEL1x4 - - -CGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -CGEMM_L1x4_END: -/*----------------------------------------*/ - - -CGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble CGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x2 - ble CGEMM_L1x2_SUB0 - bl CGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x2_SAVE - b CGEMM_L1x2_SUB2 - - -CGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-8 - addi AO,AO,-16 - LOAD1x2O 16,8 - END1x2_WITHOUT_ADD - LOAD1x2_2O 32, 16 - mtctr T8 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2_2O 32,16 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x2_SUB2_8 - bl CGEMM_1x2_L16_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x2_SUB2_4 - bl CGEMM_1x2_L8_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 32,16, 0,0 - KERNEL1x2_E2 32,16, 1,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 32,16, 0,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x2_SAVE - KERNEL1x2 - - MY_ALIGN -CGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -CGEMM_L1x2_END: -/*----------------------------------------*/ - - -CGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x1 - ble CGEMM_L1x1_SUB0 - bl CGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x1_SAVE - b CGEMM_L1x1_SUB2 - - -CGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-8 - addi AO,AO,-8 - LOAD1x1O 8,8 - END1x1_WITHOUT_ADD - LOAD1x1_2O 16, 16 - mtctr T8 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1_2O 16,16 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x1_SUB2_8 - bl CGEMM_1x1_L16_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble CGEMM_L1x1_SUB2_4 - bl CGEMM_1x1_L8_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 16,16, 0,0 - KERNEL1x1_E2 16,16, 1,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 16,16, 0,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x1_SAVE - KERNEL1x1 - - MY_ALIGN -CGEMM_L1x1_SAVE: -/*----------------------------------------*/ - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -CGEMM_L1x1_END: -/*----------------------------------------*/ - slwi T1, K, 3 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - -CGEMM_L1_END: - - - - +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
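
[Editorial note on the replacement kernel logic added from this point on.] The new CGEMM logic tiles N by 4/2/1 (CGEMM_L4, CGEMM_L2, CGEMM_L1) and M by 8/4/2/1, and for the 8-wide kernels it unrolls the K loop by 128 with the remainder peeled off in power-of-two chunks. The following C sketch is not OpenBLAS code: the function name k_steps_covered is invented, and the small-K / exactly-128 CGEMM_*_SUB0 fast paths are deliberately ignored; it only mirrors the srawi./andi. counter arithmetic of those loops and checks that the decomposition covers exactly K steps.

#include <assert.h>

/* Sketch, not OpenBLAS code: the counter arithmetic behind the 8-wide
 * CGEMM K loops in this file.  Two k-steps are software-pipelined by the
 * LOADnx8_2 / ENDnx8_2 pair, so the counters work on K-2; each main-loop
 * iteration runs 64 KERNELnx8_L2 macros (2 steps each = 128 steps), and
 * the remainder is peeled in 64/32/16/8/4/2/1-step chunks. */
static int k_steps_covered(int K)
{
    int t1     = K - 2;            /* addi T1,T1,-2                         */
    int blocks = t1 >> 7;          /* srawi. T8, T1, 7  (ctr for bdnz loop) */
    int rem    = t1 & 127;         /* andi.  L,  T1, 127                    */
    int steps  = 2 + 128 * blocks; /* pipelined pair + main loop            */

    for (int chunk = 64; chunk >= 2; chunk >>= 1)
        if (rem & chunk)           /* L64/L32/L16 subs and inline tails     */
            steps += chunk;
    if (rem & 1)
        steps += 1;                /* trailing single KERNELnx8             */
    return steps;
}

int main(void)
{
    for (int K = 2; K < 2048; K++)
        assert(k_steps_covered(K) == K); /* decomposition covers all of K   */
    return 0;
}
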
+*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + 
KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + 
blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + 
KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + 
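
[Editorial note.] The dcbt hints scattered through the unrolled subroutines above are plain software prefetches: the ..._BEGIN blocks load T2..T5 with 1024, 1536, 2048 and 2560, so each dcbt AO,Tn / dcbt BO,Tn touches the A or B panel a fixed byte distance ahead of the current pointer (PRE is defined elsewhere in the kernel headers and is not shown here). A loose C analogue follows; the helper name prefetch_panels is invented, and the per-macro interleaving of the hints is collapsed into one call.

#include <stddef.h>

/* Sketch only: roughly what the dcbt AO,T2..T5 and dcbt BO,T2..T4 hints in
 * the unrolled loops do, expressed with GCC's __builtin_prefetch.  In the
 * asm the hints are interleaved between KERNEL macros; here they are issued
 * together for readability. */
static void prefetch_panels(const void *ao, const void *bo)
{
    static const size_t a_ahead[] = { 1024, 1536, 2048, 2560 }; /* dcbt AO,T2..T5 */
    static const size_t b_ahead[] = { 1024, 1536, 2048 };       /* dcbt BO,T2..T4 */

    for (size_t i = 0; i < sizeof a_ahead / sizeof a_ahead[0]; i++)
        __builtin_prefetch((const char *)ao + a_ahead[i], 0, 1); /* read access */
    for (size_t i = 0; i < sizeof b_ahead / sizeof b_ahead[0]; i++)
        __builtin_prefetch((const char *)bo + b_ahead[i], 0, 1);
}

int main(void)
{
    float a[1024], b[1024];
    prefetch_panels(a, b);  /* prefetch hints never fault, even past the end */
    return 0;
}
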
MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S index a256e1a01..be2b74f01 100644 --- a/kernel/power/cgemm_macros_power9.S +++ b/kernel/power/cgemm_macros_power9.S @@ -1,3019 +1,3019 @@ - -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - - -.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - -/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmulsp \VSOUT1,\VSINII, alpha_i - xvmulsp \VSOUT2,\VSINRR, alpha_i -.endm - -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubasp \VSOUT1,\VSINRR, alpha_r - xvmaddasp \VSOUT2,\VSINII, alpha_r -.endm - -/* macros for N=4 and M=8 -**********************************************************************************************/ - -.macro Zero4x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, 
vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD4x8 - LOAD4x8O 0,0 -.endm - - -.macro LOAD4x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_NORMAL - END4x8 AO,BO,64,32 -.endm - - -.macro END4x8_WITHOUT_ADD - END4x8 AO,BO,0,0 -.endm - - -.macro END4x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.endm - - -.macro LOAD4x8_2 - LOAD4x8_2O 0,0 -.endm - - -.macro LOAD4x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_2 - /*for load2 offset will be 128 and 64*/ - KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 -.endm - - -.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL4x8 - LOAD4x8 - END4x8 AO, BO, 64,32 -.endm - - -.macro SAVE4x8 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm 
vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - xxperm vs2,vs50,permute_mask - xxperm vs6,vs58,permute_mask - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - xxperm vs3,vs51,permute_mask - xxperm vs7,vs59,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 - xxperm vs10,vs54,permute_mask - xxperm vs14,vs62,permute_mask - AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 - xxperm vs11,vs55,permute_mask - xxperm vs15,vs63,permute_mask - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - #ifndef TRMMKERNEL - lxv vs32 , 0(T2) - lxv vs40 , 16(T2) -#endif - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs33 , 32(T2) - lxv vs41 , 48(T2) -#endif - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 -#ifndef TRMMKERNEL - lxv vs34 , 0(T3) - lxv vs42 , 16(T3) -#endif - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs35 , 32(T3) - lxv vs43 , 48(T3) -#endif - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - MULT_APLHA_PART1 vs48,vs56,vs0,vs1 - MULT_APLHA_PART1 vs49,vs57,vs2,vs3 - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - MULT_APLHA_PART1 vs50,vs58,vs4,vs5 - MULT_APLHA_PART1 vs51,vs59,vs6,vs7 - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - MULT_APLHA_PART2 vs48,vs56,vs0,vs1 - MULT_APLHA_PART2 vs49,vs57,vs2,vs3 - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 - MULT_APLHA_PART2 vs51,vs59,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs8,vs9 - MULT_APLHA_PART1 vs53,vs61,vs10,vs11 - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - MULT_APLHA_PART1 vs54,vs62,vs12,vs13 - MULT_APLHA_PART1 vs55,vs63,vs14,vs15 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - MULT_APLHA_PART2 vs52,vs60,vs8,vs9 - MULT_APLHA_PART2 vs53,vs61,vs10,vs11 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - MULT_APLHA_PART2 vs54,vs62,vs12,vs13 - MULT_APLHA_PART2 vs55,vs63,vs14,vs15 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs32,vs32,vs1 - xvaddsp vs40,vs40,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs33,vs33,vs5 - xvaddsp vs41,vs41,vs7 - xvaddsp vs34,vs34,vs9 - xvaddsp vs42,vs42,vs11 - xvaddsp vs35,vs35,vs13 - xvaddsp vs43,vs43,vs15 -#else - xxpermdi vs32,vs8,vs0,2 - xxpermdi vs40,vs10,vs2,2 - xxpermdi vs33,vs12,vs4,2 - xxpermdi vs41,vs14,vs6,2 - xxpermdi vs34,vs0,vs8,2 - xxpermdi vs42,vs2,vs10,2 - xxpermdi vs35,vs4,vs12,2 - xxpermdi vs43,vs6,vs14,2 -#endif - stxv vs32 , 0(T2) - stxv vs40 , 16(T2) - stxv vs33 , 32(T2) - stxv vs41 , 48(T2) - stxv vs34 , 0(T3) - stxv vs42 , 16(T3) - stxv vs35 , 32(T3) - stxv vs43 , 48(T3) - addi CO, CO, 64 -.endm - -/* macros for N=4 and M=4 -**********************************************************************************************/ - -.macro Zero4x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endm - - -.macro LOAD4x4 - LOAD4x4O 0,0 -.endm - - -.macro LOAD4x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_NORMAL - END4x4 AO,BO,32,32 -.endm - - -.macro END4x4_WITHOUT_ADD - END4x4 AO,BO,0,0 -.endm - - -.macro END4x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.endm - - -.macro LOAD4x4_2 - LOAD4x4_2O 0,0 -.endm - - -.macro LOAD4x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv 
vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_2 - /*for load2 offset will be 64 and 64*/ - KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 -.endm - - -.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x4 - LOAD4x4 - END4x4 AO, BO, 32,32 -.endm - - -.macro SAVE4x4 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - #ifndef TRMMKERNEL - lxv vs28 , 0(T2) - lxv vs29 , 16(T2) -#endif -#ifndef TRMMKERNEL - lxv vs30 , 0(T3) - lxv vs31 , 16(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs48,vs56,vs4,vs5 - MULT_APLHA_PART1 vs49,vs57,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs12,vs13 - MULT_APLHA_PART1 vs53,vs61,vs14,vs15 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs48,vs56,vs4,vs5 - MULT_APLHA_PART2 vs49,vs57,vs6,vs7 - MULT_APLHA_PART2 vs52,vs60,vs12,vs13 - MULT_APLHA_PART2 vs53,vs61,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 - xvaddsp vs28,vs28,vs5 - xvaddsp vs29,vs29,vs7 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 - xxpermdi vs28,vs12,vs4,2 - xxpermdi vs29,vs14,vs6,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - stxv vs28 , 0(T2) - stxv vs29 , 16(T2) - stxv vs30 , 0(T3) - stxv vs31 , 16(T3) - addi CO, CO, 32 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD4x2 - LOAD4x2O 0,0 -.endm - - -.macro LOAD4x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_NORMAL - END4x2 AO,BO,16,32 -.endm - - -.macro END4x2_WITHOUT_ADD - END4x2 AO,BO,0,0 -.endm - - -.macro END4x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD4x2_2 - LOAD4x2_2O 0,0 -.endm - - -.macro LOAD4x2_2O OffsetA,OffsetB - lxv vs8, 
(\OffsetA)(AO) - lxv vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_2 - /*for load2 offset will be 32 and 64*/ - KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 -.endm - - -.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x2 - LOAD4x2 - END4x2 AO, BO, 16,32 -.endm - - -.macro SAVE4x2 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs25 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxv vs27 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs10,vs2,0 - xxpermdi vs3,vs0,vs8,3 - xxpermdi vs11,vs2,vs10,3 - xvaddsp vs24,vs24,vs1 - 
xvaddsp vs26,vs26,vs9 - xvaddsp vs25,vs25,vs3 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs10,vs2,0 - xxpermdi vs25,vs0,vs8,3 - xxpermdi vs27,vs2,vs10,3 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 0(T1) - stxv vs26 , 0(T2) - stxv vs27 , 0(T3) - addi CO, CO, 16 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD4x1 - LOAD4x1O 0,0 -.endm - - -.macro LOAD4x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END4x1_NORMAL - END4x1 AO,BO,8,32 -.endm - - -.macro END4x1_WITHOUT_ADD - END4x1 AO,BO,0,0 -.endm - - -.macro END4x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD4x1_2 - LOAD4x1_2O 0,0 -.endm - - -.macro LOAD4x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) -.endm - - -.macro END4x1_2 - /*for load2 offset will be 16 and 64*/ - KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 -.endm - - -.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x1 - LOAD4x1 - END4x1 AO, BO, 8,32 -.endm - - -.macro SAVE4x1 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxsd v6 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxsd v7 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - 
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - xxspltd vs9,vs2,0 - xxspltd vs11,vs2,1 - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 - xvaddsp vs38,vs38,vs9 - xvaddsp vs39,vs39,vs11 -#else - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 - xxspltd vs38,vs2,0 - xxspltd vs39,vs2,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - stxsd v6 , 0(T2) - stxsd v7 , 0(T3) - addi CO, CO, 8 -.endm - -/* macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,64,16 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_2 - /*for load2 offset will be 128 and 32*/ - KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 -.endm - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
-.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 64,16 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - addi CO, CO, 64 -.endm - -/* macros for N=2 and M=4 -**********************************************************************************************/ - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,32,16 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_2 - /*for load2 offset will be 64 and 32*/ - KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 -.endm - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 32,16 -.endm - - -.macro SAVE2x4 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - addi CO, CO, 32 -.endm - -/* macros for N=2 and M=2 -**********************************************************************************************/ - -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs36, vs36, vs36 - xxlxor vs40, vs40, vs40 - xxlxor vs44, vs44, vs44 -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,16,16 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs8, (\OffsetA)(AO) - lxv 
vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs0, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_2 - /*for load2 offset will be 32 and 32*/ - KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 -.endm - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs44, vs0,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x2 - LOAD2x2 - END2x2 AO, BO, 16,16 -.endm - - -.macro SAVE2x2 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs8,vs9, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs0,vs8,3 - xvaddsp vs24,vs24,vs1 - xvaddsp vs26,vs26,vs9 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs0,vs8,3 -#endif - stxv vs24 , 0(CO) - stxv vs26 , 0(T1) - addi CO, CO, 16 -.endm - -/* macros for N=2 and M=1 -**********************************************************************************************/ - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,8,16 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv 
vs0, (16+\OffsetB)(BO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_2 - /*for load2 offset will be 16 and 32*/ - KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 -.endm - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 8,16 -.endm - - -.macro SAVE2x1 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - /*--v4==vs36 v5==vs37---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 -#else - /*--v4==vs36 v5==vs37---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - addi CO, CO, 8 -.endm - -/* macros for N=1 and M=8 -**********************************************************************************************/ - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,64,8 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, 
(64+16+\OffsetA)(AO) - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_2 - /*for load2 offset will be 128 and 16*/ - KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 -.endm - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 64,8 -.endm - - -.macro SAVE1x8 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 - xxperm vs4,vs5, vs28 - xxperm vs6,vs7, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - xvaddsp vs26,vs26,vs4 - xvaddsp vs27,vs27,vs6 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) - stxv vs4 , 32(CO) - stxv vs6 , 48(CO) -#endif - addi CO, CO, 64 -.endm - -/* macros for N=1 and M=4 
-**********************************************************************************************/ - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,32,8 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_2 - /*for load2 offset will be 64 and 16*/ - KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 -.endm - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 32,8 -.endm - - -.macro SAVE1x4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) -#endif - addi CO, CO, 32 -.endm - -/* macros for N=1 and M=2 
-**********************************************************************************************/ - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,16,8 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs0, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_2 - /*for load2 offset will be 32 and 16*/ - KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 -.endm - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP4(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 16,8 -.endm - - -.macro SAVE1x2 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - stxv vs24 , 0(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) -#endif - addi CO, CO, 16 -.endm - -/* macros for N=1 and M=1 -**********************************************************************************************/ -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxsd v4, (\OffsetB+0)(BO) - lxsd v5, (\OffsetA+0)(AO) - xxperm vs38, vs36, permute_mask -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,8,8 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs37,vs36 - xvmaddasp vs40, vs37,vs38 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro 
LOAD1x1_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask -.endm - - -.macro END1x1_2 - /*for load2 offset will be 16 and 16*/ - KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 -.endm - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP2(\Index,\OffsetB)(\BREG) - lxv vs4, DISP2(\Index,\OffsetB)(\AREG) - xxperm vs10, vs8, permute_mask -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP2(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP2(\Index,16) -.endif - -.endif -.endm - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 8,8 -.endm - - -.macro SAVE1x1 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - /*aggregate x2*/ - xxpermdi vs33,vs32,vs32,2 - xxpermdi vs41,vs40,vs40,2 - xvaddsp vs32,vs32,vs33 - xvaddsp vs40,vs40,vs41 - - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs37,vs1 - MULT_APLHA_PART2 vs32,vs40,vs37,vs1 - -/* reconstruct r,i pairs*/ - xxperm vs37,vs1, vs28 - -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs36,vs36,vs37 - stxsd v4 , 0(CO) -#else - -/* vs37 is v5 */ - stxsd v5 , 0(CO) -#endif - addi CO, CO, 8 -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*8; -// ptrbb = bb + off*4; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+8; // number of values in A -// #else -// temp = off+4; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 8; // number of values in A -// #else -// temp -= 4; // number of values in B -// #endif -// ptrba += temp*8; -// ptrbb += temp*4; -// #endif - -// #ifdef LEFT -// off += 8; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif + +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define unit_size 8
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1i2-r1r2, so we negate the real part of alpha instead to fix the sign*/
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we negate the imaginary part of alpha instead to fix the sign*/
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1i2-r1r2, so we negate the real part of alpha instead to fix the sign*/
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we negate the imaginary part of alpha instead to fix the sign*/
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmulsp \VSOUT1,\VSINII, alpha_i
+ xvmulsp \VSOUT2,\VSINRR, alpha_i
+.endm
+
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubasp \VSOUT1,\VSINRR, alpha_r
+ xvmaddasp \VSOUT2,\VSINII, alpha_r
+.endm
+
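Taken together, MULT_APLHA_PART1 and MULT_APLHA_PART2 perform an ordinary complex multiplication by alpha, split into two passes so that each pass maps onto a single VSX multiply or fused multiply-add/subtract (xvmulsp, xvmsubasp, xvmaddasp). The scalar C sketch below models the same arithmetic for one element; the function and variable names (mult_alpha, in_r, in_i, out_r, out_i) are illustrative only and do not appear in the kernel.

```c
#include <stdio.h>

/* Scalar model of MULT_APLHA_PART1/PART2 for a single complex element.
 * PART1:  out_r = in_i * alpha_i;          out_i = in_r * alpha_i;
 * PART2:  out_r = in_r * alpha_r - out_r;  (xvmsubasp)
 *         out_i = out_i + in_i * alpha_r;  (xvmaddasp)
 * i.e. the usual (in_r + i*in_i) * (alpha_r + i*alpha_i).
 */
static void mult_alpha(float in_r, float in_i,
                       float alpha_r, float alpha_i,
                       float *out_r, float *out_i)
{
    /* PART1: both lanes multiplied by alpha_i */
    float t_r = in_i * alpha_i;
    float t_i = in_r * alpha_i;
    /* PART2: fused multiply-subtract / multiply-add with alpha_r */
    *out_r = in_r * alpha_r - t_r;
    *out_i = t_i + in_i * alpha_r;
}

int main(void)
{
    float r, i;
    mult_alpha(1.0f, 2.0f, 0.5f, -0.25f, &r, &i);
    printf("(%g, %g)\n", r, i);   /* (1+2i)*(0.5-0.25i) = 1 + 0.75i */
    return 0;
}
```

The vector macros apply the same two-step pattern to whole VSX registers of packed single-precision values, and the AGGREGATE_REALS_IMAGES variants above select the add/subtract combination required by each conjugation mode (NN/NT/…/CC).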
+/* macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro Zero4x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+
+.macro LOAD4x8
+ LOAD4x8O 0,0
+.endm
+
+
+.macro LOAD4x8O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x8_NORMAL
+ END4x8 AO,BO,64,32
+.endm
+
+
+.macro END4x8_WITHOUT_ADD
+ END4x8 AO,BO,0,0
+.endm
+
+
+.macro END4x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.endm
+
+
+.macro LOAD4x8_2
+ LOAD4x8_2O 0,0
+.endm
+
+
+.macro LOAD4x8_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs12, (16+\OffsetB)(BO)
+ lxv vs24, (32+\OffsetB)(BO)
+ lxv vs28, (32+16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ lxv vs6, (32+\OffsetA)(AO)
+ lxv vs7, (48+\OffsetA)(AO)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (64+\OffsetA)(AO)
+ lxv vs1, (64+16+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ lxv vs2, (64+32+\OffsetA)(AO)
+ lxv vs3, (64+48+\OffsetA)(AO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x8_2
+ /*for load2 offset will be 128 and 64*/
+ KERNEL4x8_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+.if \Complete==0
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
+ lxv vs5,
DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64,32 +.endm + + +.macro SAVE4x8 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm 
vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + xxperm vs2,vs50,permute_mask + xxperm vs6,vs58,permute_mask + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + xxperm vs3,vs51,permute_mask + xxperm vs7,vs59,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 + xxperm vs10,vs54,permute_mask + xxperm vs14,vs62,permute_mask + AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 + xxperm vs11,vs55,permute_mask + xxperm vs15,vs63,permute_mask + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + #ifndef TRMMKERNEL + lxv vs32 , 0(T2) + lxv vs40 , 16(T2) +#endif + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs33 , 32(T2) + lxv vs41 , 48(T2) +#endif + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 +#ifndef TRMMKERNEL + lxv vs34 , 0(T3) + lxv vs42 , 16(T3) +#endif + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs35 , 32(T3) + lxv vs43 , 48(T3) +#endif + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + MULT_APLHA_PART1 vs48,vs56,vs0,vs1 + MULT_APLHA_PART1 vs49,vs57,vs2,vs3 + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + MULT_APLHA_PART1 vs50,vs58,vs4,vs5 + MULT_APLHA_PART1 vs51,vs59,vs6,vs7 + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + MULT_APLHA_PART2 vs48,vs56,vs0,vs1 + MULT_APLHA_PART2 vs49,vs57,vs2,vs3 + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 + MULT_APLHA_PART2 vs51,vs59,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs8,vs9 + MULT_APLHA_PART1 vs53,vs61,vs10,vs11 + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + MULT_APLHA_PART1 vs54,vs62,vs12,vs13 + MULT_APLHA_PART1 vs55,vs63,vs14,vs15 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + MULT_APLHA_PART2 vs52,vs60,vs8,vs9 + MULT_APLHA_PART2 vs53,vs61,vs10,vs11 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + MULT_APLHA_PART2 vs54,vs62,vs12,vs13 + MULT_APLHA_PART2 vs55,vs63,vs14,vs15 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs32,vs32,vs1 + xvaddsp vs40,vs40,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs33,vs33,vs5 + xvaddsp vs41,vs41,vs7 + xvaddsp vs34,vs34,vs9 + xvaddsp vs42,vs42,vs11 + xvaddsp vs35,vs35,vs13 + xvaddsp vs43,vs43,vs15 +#else + xxpermdi vs32,vs8,vs0,2 + xxpermdi vs40,vs10,vs2,2 + xxpermdi vs33,vs12,vs4,2 + xxpermdi vs41,vs14,vs6,2 + xxpermdi vs34,vs0,vs8,2 + xxpermdi vs42,vs2,vs10,2 + xxpermdi vs35,vs4,vs12,2 + xxpermdi vs43,vs6,vs14,2 +#endif + stxv vs32 , 0(T2) + stxv vs40 , 16(T2) + stxv vs33 , 32(T2) + stxv vs41 , 48(T2) + stxv vs34 , 0(T3) + stxv vs42 , 16(T3) + stxv vs35 , 32(T3) + stxv vs43 , 48(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro Zero4x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endm + + +.macro LOAD4x4 + LOAD4x4O 0,0 +.endm + + +.macro LOAD4x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_NORMAL + END4x4 AO,BO,32,32 +.endm + + +.macro END4x4_WITHOUT_ADD + END4x4 AO,BO,0,0 +.endm + + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.endm + + +.macro LOAD4x4_2 + LOAD4x4_2O 0,0 +.endm + + +.macro LOAD4x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv 
vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 +.endm + + +.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32,32 +.endm + + +.macro SAVE4x4 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + #ifndef TRMMKERNEL + lxv vs28 , 0(T2) + lxv vs29 , 16(T2) +#endif +#ifndef TRMMKERNEL + lxv vs30 , 0(T3) + lxv vs31 , 16(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs48,vs56,vs4,vs5 + MULT_APLHA_PART1 vs49,vs57,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs12,vs13 + MULT_APLHA_PART1 vs53,vs61,vs14,vs15 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs48,vs56,vs4,vs5 + MULT_APLHA_PART2 vs49,vs57,vs6,vs7 + MULT_APLHA_PART2 vs52,vs60,vs12,vs13 + MULT_APLHA_PART2 vs53,vs61,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 + xvaddsp vs28,vs28,vs5 + xvaddsp vs29,vs29,vs7 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 + xxpermdi vs28,vs12,vs4,2 + xxpermdi vs29,vs14,vs6,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + stxv vs28 , 0(T2) + stxv vs29 , 16(T2) + stxv vs30 , 0(T3) + stxv vs31 , 16(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD4x2 + LOAD4x2O 0,0 +.endm + + +.macro LOAD4x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_NORMAL + END4x2 AO,BO,16,32 +.endm + + +.macro END4x2_WITHOUT_ADD + END4x2 AO,BO,0,0 +.endm + + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD4x2_2 + LOAD4x2_2O 0,0 +.endm + + +.macro LOAD4x2_2O OffsetA,OffsetB + lxv vs8, 
(\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16,32 +.endm + + +.macro SAVE4x2 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs10,vs2,0 + xxpermdi vs3,vs0,vs8,3 + xxpermdi vs11,vs2,vs10,3 + xvaddsp vs24,vs24,vs1 + 
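+ /* editor's note: on the non-TRMM path the alpha-scaled tile computed above is
+    accumulated into the C values loaded earlier in this macro; the TRMMKERNEL
+    branch below skips those loads and overwrites C with the permuted results. */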
xvaddsp vs26,vs26,vs9 + xvaddsp vs25,vs25,vs3 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs10,vs2,0 + xxpermdi vs25,vs0,vs8,3 + xxpermdi vs27,vs2,vs10,3 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 0(T1) + stxv vs26 , 0(T2) + stxv vs27 , 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD4x1 + LOAD4x1O 0,0 +.endm + + +.macro LOAD4x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END4x1_NORMAL + END4x1 AO,BO,8,32 +.endm + + +.macro END4x1_WITHOUT_ADD + END4x1 AO,BO,0,0 +.endm + + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD4x1_2 + LOAD4x1_2O 0,0 +.endm + + +.macro LOAD4x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) +.endm + + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 +.endm + + +.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8,32 +.endm + + +.macro SAVE4x1 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + 
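+ /* editor's note: MULT_APLHA_PART1/PART2 (spelling kept as in the macro
+    definitions earlier in this file) together apply the complex alpha scaling,
+    roughly out_r = alpha_r*acc_r - alpha_i*acc_i and
+    out_i = alpha_r*acc_i + alpha_i*acc_r, split in two halves so they can be
+    interleaved with neighbouring instructions. */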
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + xxspltd vs9,vs2,0 + xxspltd vs11,vs2,1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 + xvaddsp vs38,vs38,vs9 + xvaddsp vs39,vs39,vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 + xxspltd vs38,vs2,0 + xxspltd vs39,vs2,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + stxsd v6 , 0(T2) + stxsd v7 , 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,64,16 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 +.endm + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
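+ /* editor's note: these guarded loads fetch the next unrolled step's operands so
+    they overlap with the xvmaddasp chain above; the *_E2 variants pass Complete=1
+    to skip the reloads on the final iteration. */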
+.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64,16 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,32,16 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 +.endm + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + 
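+ /* editor's note: the DISP* helpers (assumed to be defined near the top of this
+    file) convert the unroll Index into a byte displacement, so these loads pick
+    up the A panel for the following iteration. */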
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32,16 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs36, vs36, vs36 + xxlxor vs40, vs40, vs40 + xxlxor vs44, vs44, vs44 +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,16,16 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv 
vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 +.endm + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs44, vs0,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16,16 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs8,vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs0,vs8,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs0,vs8,3 +#endif + stxv vs24 , 0(CO) + stxv vs26 , 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,8,16 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv 
vs0, (16+\OffsetB)(BO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8,16 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,64,8 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, 
(64+16+\OffsetA)(AO) + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 +.endm + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + + +.macro SAVE1x8 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 + xxperm vs4,vs5, vs28 + xxperm vs6,vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + xvaddsp vs26,vs26,vs4 + xvaddsp vs27,vs27,vs6 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) + stxv vs4 , 32(CO) + stxv vs6 , 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,32,8 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 +.endm + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + + +.macro SAVE1x4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 
+**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,16,8 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs0, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 +.endm + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + stxv vs24 , 0(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,8,8 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs37,vs36 + xvmaddasp vs40, vs37,vs38 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro 
LOAD1x1_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 +.endm + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index,\OffsetB)(\BREG) + lxv vs4, DISP2(\Index,\OffsetB)(\AREG) + xxperm vs10, vs8, permute_mask +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP2(\Index,16) +.endif + +.endif +.endm + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33,vs32,vs32,2 + xxpermdi vs41,vs40,vs40,2 + xvaddsp vs32,vs32,vs33 + xvaddsp vs40,vs40,vs41 + + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs37,vs1 + MULT_APLHA_PART2 vs32,vs40,vs37,vs1 + +/* reconstruct r,i pairs*/ + xxperm vs37,vs1, vs28 + +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36,vs36,vs37 + stxsd v4 , 0(CO) +#else + +/* vs37 is v5 */ + stxsd v5 , 0(CO) +#endif + addi CO, CO, 8 +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index 8663039c5..575847da2 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -1,597 +1,597 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/zgemv_n.c" -#else - -#include -#include -#include "common.h" -#include -#define NBMAX 1024 - - -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; - register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; - register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; - register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; - register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; - register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; - register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; - register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; - register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - register __vector float *vptr_a2 = (__vector float *) a2; - register __vector float *vptr_a3 = (__vector float *) a3; - BLASLONG i = 0; - BLASLONG i2=16; - for (;i< n * 8; i+=32,i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; - va0 = vec_perm(va0, va0,swap_mask); - va0_1 = vec_perm(va0_1, va0_1,swap_mask); - va1 = vec_perm(va1, va1,swap_mask); - va1_1 = vec_perm(va1_1, va1_1,swap_mask); - va2 = vec_perm(va2, va2,swap_mask); - va2_1 = vec_perm(va2_1, va2_1,swap_mask); - va3 = vec_perm(va3, va3,swap_mask); - va3_1 = vec_perm(va3_1, va3_1,swap_mask); - vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; - vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - register __vector float va1x = vec_perm(va1, va1,swap_mask); - register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); - vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) ap; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - vy_0 += va0*vx0_r + va0x*vx0_i; - vy_1 += va0_1*vx0_r + va0x_1*vx0_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } -} - - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i=0; - - - if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i +#include +#include "common.h" +#include +#define NBMAX 1024 + + +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + 
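+ /* editor's note (sketch): vxk_r broadcasts Re(x[k]) while vxk_i carries Im(x[k])
+  * with alternating signs; together with the swap_mask permute applied to the
+  * column vectors below, each lane pair accumulates the usual complex product:
+  *   y_r += a_r*x_r - a_i*x_i;  y_i += a_i*x_r + a_r*x_i;
+  * the conjugated branch below flips the signs accordingly. */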
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; + register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; + register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; + register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; + register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; + register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; + register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; + register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; + register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + register __vector float *vptr_a2 = (__vector float *) a2; + register __vector float *vptr_a3 = (__vector float *) a3; + BLASLONG i = 0; + BLASLONG i2=16; + for (;i< n * 8; i+=32,i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; + va0 = vec_perm(va0, va0,swap_mask); + va0_1 = vec_perm(va0_1, va0_1,swap_mask); + va1 = vec_perm(va1, va1,swap_mask); + va1_1 = vec_perm(va1_1, va1_1,swap_mask); + va2 = vec_perm(va2, va2,swap_mask); + va2_1 = vec_perm(va2_1, va2_1,swap_mask); + va3 = vec_perm(va3, va3,swap_mask); + va3_1 = vec_perm(va3_1, va3_1,swap_mask); + vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; + vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + 
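+ /* editor's note: offsets here are byte offsets; a complex float takes 8 bytes,
+  * so each pass handles 32 bytes (4 complex elements) per column through the two
+  * running offsets i and i2 = i + 16. */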
register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + register __vector float va1x = vec_perm(va1, va1,swap_mask); + register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); + vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) ap; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } +} + + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i=0; + + + if (inc_dest != 2) { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* vptr_a2 = (__vector float*) a2; - __vector float* vptr_a3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( 
i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - vtemp2_p += vx_0*va2 + vx_1*va2_1; - vtemp2_r += vxr_0*va2 + vxr_1*va2_1; - vtemp3_p += vx_0*va3 + vx_1*va3_1; - vtemp3_r += vxr_0*va3 + vxr_1*va3_1; - - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - -#endif - -} - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) 
r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - - - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - - } -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - -#endif - -} - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) ap; - __vector float* v_x = (__vector float*) x; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - 
vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - -#endif - - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i=0; - BLASLONG j=0; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += 
a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return (0); - } - - if (m3 == 1) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += 
inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); - -} -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else + +#include "common.h" + +#define NBMAX 1024 +#include +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* vptr_a2 = (__vector float*) a2; + __vector float* vptr_a3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + vtemp2_p += vx_0*va2 + vx_1*va2_1; + vtemp2_r += vxr_0*va2 + vxr_1*va2_1; + vtemp3_p += vx_0*va3 + vx_1*va3_1; + vtemp3_r += vxr_0*va3 + vxr_1*va3_1; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + 
register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif + +} + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + + + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * 
temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif + +} + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) ap; + __vector float* v_x = (__vector float*) x; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 1); + dest += 2; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i=0; + BLASLONG j=0; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + 
y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + 
y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 84ba5d913..dbd7e3482 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -1,233 +1,233 @@ -/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if defined(POWER8) || defined(POWER9) || defined(POWER10) -#if defined(__VEC__) || defined(__ALTIVEC__) - -static void crot_kernel_8 (long n, float *x, float *y, float c, float s) -{ - __vector float t0; - __vector float t1; - __vector float t2; - __vector float t3; - __vector float t4; - __vector float t5; - __vector float t6; - __vector float t7; - __asm__ - ( - "xscvdpspn 36, %x[cos] \n\t" // load c to all words - "xxspltw 36, 36, 0 \n\t" - "xscvdpspn 37, %x[sin] \n\t" // load s to all words - "xxspltw 37, 37, 0 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 64 \n\t" - "addi %[y_ptr], %[y_ptr], 64 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "ble two%= \n\t" - ".align 5 \n\t" - "one%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 128 \n\t" - "addi %[y_ptr], %[y_ptr], 128 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "bgt one%= \n\t" - "two%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] " - : - [mem_x] "+m" (*(float (*)[2*n])x), - [mem_y] "+m" (*(float (*)[2*n])y), - [temp_n] "+r" (n), - [x_ptr] "+&b" (x), - [y_ptr] "+&b" (y), - [x0] "=wa" (t0), - [x1] "=wa" (t2), - [x2] "=wa" (t1), - [x3] "=wa" (t3), - [x4] "=wa" (t4), - [x5] "=wa" (t5), - [x6] "=wa" (t6), - [x7] "=wa" (t7) - : - [cos] "f" (c), - [sin] "f" (s), - [i16] "b" (16), - [i32] "b" (32), - [i48] "b" (48) - : - "cr0", - "vs32","vs33","vs34","vs35","vs36","vs37", - "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51" - ); -} - -#endif -#endif - - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { -#if defined(__VEC__) || defined(__ALTIVEC__) - BLASLONG n1 = n & -8; - if ( n1 > 0 ) - { - crot_kernel_8(n1, x, y, c, s); - i=n1; - ix=2*n1; - } -#endif - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; - - } - - } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - } - return(0); -} - +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "ble two%= \n\t" + ".align 5 \n\t" + "one%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "bgt one%= \n\t" + "two%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x, y, c, s); + i=n1; + ix=2*n1; + } +#endif + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + } + return(0); +} + diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S index 2fb1b27ef..86108f20c 100644 --- a/kernel/power/dgemm_kernel_power9.S +++ b/kernel/power/dgemm_kernel_power9.S @@ -1,249 +1,249 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld - - - - -#define STACKSIZE (512 ) -#define ALPHA_SP (296+192)(SP) -#define FZERO (304+192)(SP) - - - -#define M r3 -#define N r4 -#define K r5 - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs18 - -#define o0 0 - - -#define T4 r12 -#define T3 r11 -#define C4 r14 -#define o8 r15 -#define o24 r16 -#define C2 r17 -#define L r18 -#define T1 r19 -#define C3 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define o16 r27 -#define o32 r28 -#define o48 r29 - -#define PRE r30 -#define T2 r31 - -#include "dgemm_macros_power9.S" - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - li r0, 0 - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - - stfd f1, ALPHA_SP - stw r0, FZERO - - slwi LDC, LDC, BASE_SHIFT - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - - - cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 - - - - addi T1, SP, 296+192 - - - li PRE, 384 - li o8 , 8 - li o16, 16 - li o24, 24 - li o32, 32 - li o48, 48 - - - lxvdsx alpha_r, 0, T1 - -#include 
"dgemm_logic_power9.S" - -.L999: - addi r3, 0, 0 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld + + + + +#define STACKSIZE (512 ) +#define ALPHA_SP (296+192)(SP) +#define FZERO (304+192)(SP) + + + +#define M r3 +#define N r4 +#define K r5 + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs18 + +#define o0 0 + + +#define T4 r12 +#define T3 r11 +#define C4 r14 +#define o8 r15 +#define o24 r16 +#define C2 r17 +#define L r18 +#define T1 r19 +#define C3 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_power9.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + + + stfd f1, ALPHA_SP + stw r0, FZERO + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + + + addi T1, SP, 296+192 + + + li PRE, 384 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + + lxvdsx alpha_r, 0, T1 + +#include "dgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S index 251839d19..a48bc685a 100644 --- a/kernel/power/dgemm_logic_power9.S +++ b/kernel/power/dgemm_logic_power9.S @@ -1,1981 +1,1981 @@ -/*************************************************************************** 
-Copyright (c) 2013-2019 The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#define MY_ALIGN .align 3 - -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 2 - ble LDGEMM_L4_END - -LDGEMM_L4_BEGIN: - - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LDGEMM_L4x16_END - - MY_ALIGN -LDGEMM_L4x16_BEGIN: - - li L, -128 - - - SAVE4x16_REGS - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - - and T1, CO, L - and T2, C2, L - and T3, C3, L - and T4, C4, L - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - - - addi T1, T1, 128 - addi T2, T2, 128 - addi T3, T3, 128 - addi T4, T4, 128 - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 - srawi. L, T3, 5 -#else - srawi. L, K, 5 -#endif - - ble LDGEMM_L4x16_SUB0 - - - MY_ALIGN -LDGEMM_L4x16_LOOP_START: - - li T2, 512 - - - LOAD4x16_1 - ##OffsetA=128 OffsetB=32 - addi AO,AO,2176 - # addi BO,BO,32 - addic. 
L, L, -1 - - ble LDGEMM_L4x16_LOOP_END - - - mtctr L - - MY_ALIGN - -LDGEMM_L4x16_LOOP: - - #dcbt AO, PRE - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_2 -2048,32, 15,1 - - - bdnz LDGEMM_L4x16_LOOP - - MY_ALIGN - MY_ALIGN -LDGEMM_L4x16_LOOP_END: - - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_3 -2048,32, 15,1 - b LDGEMM_L4x16_SUB1 - - - MY_ALIGN -LDGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - KERNEL4x16 1 - - addic. L, L, -1 - ble LDGEMM_L4x16_SAVE - b LDGEMM_L4x16_SUB2 - MY_ALIGN -LDGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - ble LDGEMM_L4x16_SAVE - MY_ALIGN -LDGEMM_L4x16_SUB2: - - andi. T1,L, 16 - ble LDGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_2 128,32, 3,0 - KERNEL4x16_I1_L2_2 128,32, 4,0 - KERNEL4x16_I1_L2_2 128,32, 5,0 - KERNEL4x16_I1_L2_2 128,32, 6,0 - KERNEL4x16_I1_L2_3 128,32, 7,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_8: - andi. T1,L, 8 - ble LDGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_3 128,32, 3,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_3 128,32, 1,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 128,32, 0,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LDGEMM_L4x16_SUB2 - - MY_ALIGN -LDGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LDGEMM_L4x16_BEGIN - -LDGEMM_L4x16_END: - -LDGEMM_L4x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L4x1_END - - andi. T1, M, 8 - ble LDGEMM_L4x8_END - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 - srawi. L, T3, 4 -#else - mr BO, B - srawi. L, K, 4 -#endif - - - ble LDGEMM_L4x8_SUB0 - -LDGEMM_L4x8_LOOP_START: - - - LOAD4x8_1 - ##OffsetA=64 OffsetB=32 - - - addic. 
L, L, -1 - - ble LDGEMM_L4x8_LOOP_END - - mtctr L - MY_ALIGN - -LDGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_2 64,32, 7,1 - - bdnz LDGEMM_L4x8_LOOP - MY_ALIGN -LDGEMM_L4x8_LOOP_END: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_3 64,32, 7,1 - - b LDGEMM_L4x8_SUB1 - MY_ALIGN -LDGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - KERNEL4x8 1 - - addic. L, L, -1 - ble LDGEMM_L4x8_SAVE - b LDGEMM_L4x8_SUB2 - MY_ALIGN -LDGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - ble LDGEMM_L4x8_SAVE - MY_ALIGN -LDGEMM_L4x8_SUB2: - - andi. T1,L, 8 - ble LDGEMM_L4x8_SUB2_4 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_3 64,32, 3,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_3 64,32, 1,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 64,32, 0,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x8_SAVE - KERNEL4x8 0 - - MY_ALIGN -LDGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 -#endif -LDGEMM_L4x8_END: - -LDGEMM_L4x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x4_SUB4 - -LDGEMM_L4x4_LOOP_START: - - #dcbt AO, PRE - LOAD4x4_1 - KERNEL4x4_I1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -2 - ble LDGEMM_L4x4_LOOP_END - - MY_ALIGN - -LDGEMM_L4x4_LOOP: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -1 - bgt LDGEMM_L4x4_LOOP - -LDGEMM_L4x4_LOOP_END: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_E2 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB4: - - KERNEL4x4_SUBI1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x4_SAVE - b LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x4_SAVE - -LDGEMM_L4x4_SUB2: - - KERNEL4x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SAVE: - - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 -#endif -LDGEMM_L4x4_END: - -LDGEMM_L4x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L4x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x2_SUB4 - -LDGEMM_L4x2_LOOP_START: - - LOAD4x2_1 - KERNEL4x2_I1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -2 - ble LDGEMM_L4x2_LOOP_END - - MY_ALIGN - -LDGEMM_L4x2_LOOP: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -1 - bgt LDGEMM_L4x2_LOOP - -LDGEMM_L4x2_LOOP_END: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_E2 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB4: - - KERNEL4x2_SUBI1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x2_SAVE - b LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x2_SAVE - -LDGEMM_L4x2_SUB2: - - KERNEL4x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SAVE: - - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 -#endif -LDGEMM_L4x2_END: - -LDGEMM_L4x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x1_SUB4 - -LDGEMM_L4x1_LOOP_START: - - LOAD4x1_1 - KERNEL4x1_I1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -2 - ble LDGEMM_L4x1_LOOP_END - - MY_ALIGN - -LDGEMM_L4x1_LOOP: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -1 - bgt LDGEMM_L4x1_LOOP - -LDGEMM_L4x1_LOOP_END: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_E2 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB4: - - KERNEL4x1_SUBI1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x1_SAVE - b LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x1_SAVE - -LDGEMM_L4x1_SUB2: - - KERNEL4x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SAVE: - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 -#endif -LDGEMM_L4x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - addic. J, J, -1 - bgt LDGEMM_L4_BEGIN - - andi. T2, N, 3 - ble .L999 - -LDGEMM_L4_END: - - b LDGEMM_L2_BEGIN - -.L999_H1: - - b .L999 - -LDGEMM_L2_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 2 - ble LDGEMM_L2_END - mr CO, C - mr AO, A - slwi T1, LDC , 1 - add C, C, T1 - srawi. I, M, 4 - ble LDGEMM_L2x16_END - -LDGEMM_L2x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x16_SUB4 - -LDGEMM_L2x16_LOOP_START: - - #dcbt AO, PRE - LOAD2x16_1 - #dcbt AO, PRE - KERNEL2x16_I1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -2 - ble LDGEMM_L2x16_LOOP_END - - MY_ALIGN - -LDGEMM_L2x16_LOOP: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -1 - bgt LDGEMM_L2x16_LOOP - -LDGEMM_L2x16_LOOP_END: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - KERNEL2x16_E2 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB4: - - #dcbt AO, PRE - KERNEL2x16_SUBI1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x16_SAVE - b LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x16_SAVE - -LDGEMM_L2x16_SUB2: - - KERNEL2x16_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SAVE: - - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt LDGEMM_L2x16_BEGIN - -LDGEMM_L2x16_END: - -LDGEMM_L2x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L2x1_END - - andi. T1, M, 8 - ble LDGEMM_L2x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x8_SUB4 - -LDGEMM_L2x8_LOOP_START: - - #dcbt AO, PRE - LOAD2x8_1 - KERNEL2x8_I1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -2 - ble LDGEMM_L2x8_LOOP_END - - MY_ALIGN - -LDGEMM_L2x8_LOOP: - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -1 - bgt LDGEMM_L2x8_LOOP - -LDGEMM_L2x8_LOOP_END: - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_2 - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_E2 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB4: - - KERNEL2x8_SUBI1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x8_SAVE - b LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x8_SAVE - -LDGEMM_L2x8_SUB2: - - KERNEL2x8_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SAVE: - - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 -#endif -LDGEMM_L2x8_END: - -LDGEMM_L2x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x4_SUB4 - -LDGEMM_L2x4_LOOP_START: - - LOAD2x4_1 - KERNEL2x4_I1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -2 - ble LDGEMM_L2x4_LOOP_END - - MY_ALIGN - -LDGEMM_L2x4_LOOP: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -1 - bgt LDGEMM_L2x4_LOOP - -LDGEMM_L2x4_LOOP_END: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_E2 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB4: - - KERNEL2x4_SUBI1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x4_SAVE - b LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x4_SAVE - -LDGEMM_L2x4_SUB2: - - KERNEL2x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SAVE: - - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 -#endif -LDGEMM_L2x4_END: - -LDGEMM_L2x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x2_SUB4 - -LDGEMM_L2x2_LOOP_START: - - LOAD2x2_1 - KERNEL2x2_I1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -2 - ble LDGEMM_L2x2_LOOP_END - - MY_ALIGN - -LDGEMM_L2x2_LOOP: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -1 - bgt LDGEMM_L2x2_LOOP - -LDGEMM_L2x2_LOOP_END: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_E2 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB4: - - KERNEL2x2_SUBI1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x2_SAVE - b LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x2_SAVE - -LDGEMM_L2x2_SUB2: - - KERNEL2x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SAVE: - - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 -#endif -LDGEMM_L2x2_END: - -LDGEMM_L2x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x1_SUB4 - -LDGEMM_L2x1_LOOP_START: - - LOAD2x1_1 - KERNEL2x1_I1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -2 - ble LDGEMM_L2x1_LOOP_END - - MY_ALIGN - -LDGEMM_L2x1_LOOP: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -1 - bgt LDGEMM_L2x1_LOOP - -LDGEMM_L2x1_LOOP_END: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_E2 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB4: - - KERNEL2x1_SUBI1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x1_SAVE - b LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x1_SAVE - -LDGEMM_L2x1_SUB2: - - KERNEL2x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SAVE: - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 -#endif -LDGEMM_L2x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LDGEMM_L2_END: -LDGEMM_L1_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 1 - ble LDGEMM_L1_END - mr CO, C - mr AO, A - srawi. I, M, 4 - ble LDGEMM_L1x16_END - -LDGEMM_L1x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x16_SUB4 - -LDGEMM_L1x16_LOOP_START: - - #dcbt AO, PRE - LOAD1x16_1 - #dcbt AO, PRE - KERNEL1x16_I1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -2 - ble LDGEMM_L1x16_LOOP_END - - MY_ALIGN - -LDGEMM_L1x16_LOOP: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -1 - bgt LDGEMM_L1x16_LOOP - -LDGEMM_L1x16_LOOP_END: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - KERNEL1x16_E2 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB4: - - #dcbt AO, PRE - KERNEL1x16_SUBI1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x16_SAVE - b LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x16_SAVE - -LDGEMM_L1x16_SUB2: - - KERNEL1x16_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SAVE: - - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt LDGEMM_L1x16_BEGIN - -LDGEMM_L1x16_END: - -LDGEMM_L1x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L1x1_END - - andi. T1, M, 8 - ble LDGEMM_L1x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x8_SUB4 - -LDGEMM_L1x8_LOOP_START: - - #dcbt AO, PRE - LOAD1x8_1 - KERNEL1x8_I1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -2 - ble LDGEMM_L1x8_LOOP_END - - MY_ALIGN - -LDGEMM_L1x8_LOOP: - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -1 - bgt LDGEMM_L1x8_LOOP - -LDGEMM_L1x8_LOOP_END: - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_2 - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_E2 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB4: - - KERNEL1x8_SUBI1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x8_SAVE - b LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x8_SAVE - -LDGEMM_L1x8_SUB2: - - KERNEL1x8_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SAVE: - - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 -#endif -LDGEMM_L1x8_END: - -LDGEMM_L1x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x4_SUB4 - -LDGEMM_L1x4_LOOP_START: - - LOAD1x4_1 - KERNEL1x4_I1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -2 - ble LDGEMM_L1x4_LOOP_END - - MY_ALIGN - -LDGEMM_L1x4_LOOP: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -1 - bgt LDGEMM_L1x4_LOOP - -LDGEMM_L1x4_LOOP_END: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_E2 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB4: - - KERNEL1x4_SUBI1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x4_SAVE - b LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x4_SAVE - -LDGEMM_L1x4_SUB2: - - KERNEL1x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SAVE: - - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 -#endif -LDGEMM_L1x4_END: - -LDGEMM_L1x2_BEGIN: - - - andi. 
T1, M, 2 - ble LDGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x2_SUB4 - -LDGEMM_L1x2_LOOP_START: - - LOAD1x2_1 - KERNEL1x2_I1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -2 - ble LDGEMM_L1x2_LOOP_END - - MY_ALIGN - -LDGEMM_L1x2_LOOP: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -1 - bgt LDGEMM_L1x2_LOOP - -LDGEMM_L1x2_LOOP_END: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_E2 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB4: - - KERNEL1x2_SUBI1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x2_SAVE - b LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x2_SAVE - -LDGEMM_L1x2_SUB2: - - KERNEL1x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SAVE: - - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 -#endif -LDGEMM_L1x2_END: - -LDGEMM_L1x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x1_SUB4 - -LDGEMM_L1x1_LOOP_START: - - LOAD1x1_1 - KERNEL1x1_I1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -2 - ble LDGEMM_L1x1_LOOP_END - - MY_ALIGN - -LDGEMM_L1x1_LOOP: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -1 - bgt LDGEMM_L1x1_LOOP - -LDGEMM_L1x1_LOOP_END: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_E2 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB4: - - KERNEL1x1_SUBI1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x1_SAVE - b LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x1_SAVE - -LDGEMM_L1x1_SUB2: - - KERNEL1x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SAVE: - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 -#endif -LDGEMM_L1x1_END: -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif -LDGEMM_L1_END: +/*************************************************************************** +Copyright (c) 2013-2019 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 2 + ble LDGEMM_L4_END + +LDGEMM_L4_BEGIN: + + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN: + + li L, -128 + + + SAVE4x16_REGS + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + + and T1, CO, L + and T2, C2, L + and T3, C3, L + and T4, C4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 + srawi. L, T3, 5 +#else + srawi. L, K, 5 +#endif + + ble LDGEMM_L4x16_SUB0 + + + MY_ALIGN +LDGEMM_L4x16_LOOP_START: + + li T2, 512 + + + LOAD4x16_1 + ##OffsetA=128 OffsetB=32 + addi AO,AO,2176 + # addi BO,BO,32 + addic. 
L, L, -1 + + ble LDGEMM_L4x16_LOOP_END + + + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + #dcbt AO, PRE + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_2 -2048,32, 15,1 + + + bdnz LDGEMM_L4x16_LOOP + + MY_ALIGN + MY_ALIGN +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_3 -2048,32, 15,1 + b LDGEMM_L4x16_SUB1 + + + MY_ALIGN +LDGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + KERNEL4x16 1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 + MY_ALIGN +LDGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + ble LDGEMM_L4x16_SAVE + MY_ALIGN +LDGEMM_L4x16_SUB2: + + andi. T1,L, 16 + ble LDGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_2 128,32, 3,0 + KERNEL4x16_I1_L2_2 128,32, 4,0 + KERNEL4x16_I1_L2_2 128,32, 5,0 + KERNEL4x16_I1_L2_2 128,32, 6,0 + KERNEL4x16_I1_L2_3 128,32, 7,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_8: + andi. T1,L, 8 + ble LDGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_3 128,32, 3,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_3 128,32, 1,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 128,32, 0,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN +LDGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LDGEMM_L4x16_BEGIN + +LDGEMM_L4x16_END: + +LDGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L4x1_END + + andi. T1, M, 8 + ble LDGEMM_L4x8_END + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 + srawi. L, T3, 4 +#else + mr BO, B + srawi. L, K, 4 +#endif + + + ble LDGEMM_L4x8_SUB0 + +LDGEMM_L4x8_LOOP_START: + + + LOAD4x8_1 + ##OffsetA=64 OffsetB=32 + + + addic. 
L, L, -1 + + ble LDGEMM_L4x8_LOOP_END + + mtctr L + MY_ALIGN + +LDGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_2 64,32, 7,1 + + bdnz LDGEMM_L4x8_LOOP + MY_ALIGN +LDGEMM_L4x8_LOOP_END: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_3 64,32, 7,1 + + b LDGEMM_L4x8_SUB1 + MY_ALIGN +LDGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + KERNEL4x8 1 + + addic. L, L, -1 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 + MY_ALIGN +LDGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + ble LDGEMM_L4x8_SAVE + MY_ALIGN +LDGEMM_L4x8_SUB2: + + andi. T1,L, 8 + ble LDGEMM_L4x8_SUB2_4 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_3 64,32, 3,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_3 64,32, 1,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 64,32, 0,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x8_SAVE + KERNEL4x8 0 + + MY_ALIGN +LDGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 +#endif +LDGEMM_L4x8_END: + +LDGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x4_SUB4 + +LDGEMM_L4x4_LOOP_START: + + #dcbt AO, PRE + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -2 + ble LDGEMM_L4x4_LOOP_END + + MY_ALIGN + +LDGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -1 + bgt LDGEMM_L4x4_LOOP + +LDGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x4_SAVE + +LDGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SAVE: + + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 +#endif +LDGEMM_L4x4_END: + +LDGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x2_SUB4 + +LDGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble LDGEMM_L4x2_LOOP_END + + MY_ALIGN + +LDGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt LDGEMM_L4x2_LOOP + +LDGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x2_SAVE + +LDGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SAVE: + + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 +#endif +LDGEMM_L4x2_END: + +LDGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x1_SUB4 + +LDGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble LDGEMM_L4x1_LOOP_END + + MY_ALIGN + +LDGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt LDGEMM_L4x1_LOOP + +LDGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x1_SAVE + +LDGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SAVE: + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 +#endif +LDGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + addic. J, J, -1 + bgt LDGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +LDGEMM_L4_END: + + b LDGEMM_L2_BEGIN + +.L999_H1: + + b .L999 + +LDGEMM_L2_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 2 + ble LDGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble LDGEMM_L2x16_END + +LDGEMM_L2x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x16_SUB4 + +LDGEMM_L2x16_LOOP_START: + + #dcbt AO, PRE + LOAD2x16_1 + #dcbt AO, PRE + KERNEL2x16_I1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble LDGEMM_L2x16_LOOP_END + + MY_ALIGN + +LDGEMM_L2x16_LOOP: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt LDGEMM_L2x16_LOOP + +LDGEMM_L2x16_LOOP_END: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB4: + + #dcbt AO, PRE + KERNEL2x16_SUBI1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x16_SAVE + +LDGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SAVE: + + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt LDGEMM_L2x16_BEGIN + +LDGEMM_L2x16_END: + +LDGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L2x1_END + + andi. T1, M, 8 + ble LDGEMM_L2x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x8_SUB4 + +LDGEMM_L2x8_LOOP_START: + + #dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble LDGEMM_L2x8_LOOP_END + + MY_ALIGN + +LDGEMM_L2x8_LOOP: + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt LDGEMM_L2x8_LOOP + +LDGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x8_SAVE + +LDGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SAVE: + + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 +#endif +LDGEMM_L2x8_END: + +LDGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x4_SUB4 + +LDGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble LDGEMM_L2x4_LOOP_END + + MY_ALIGN + +LDGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt LDGEMM_L2x4_LOOP + +LDGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x4_SAVE + +LDGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SAVE: + + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 +#endif +LDGEMM_L2x4_END: + +LDGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x2_SUB4 + +LDGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble LDGEMM_L2x2_LOOP_END + + MY_ALIGN + +LDGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt LDGEMM_L2x2_LOOP + +LDGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x2_SAVE + +LDGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SAVE: + + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 +#endif +LDGEMM_L2x2_END: + +LDGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x1_SUB4 + +LDGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble LDGEMM_L2x1_LOOP_END + + MY_ALIGN + +LDGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt LDGEMM_L2x1_LOOP + +LDGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x1_SAVE + +LDGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SAVE: + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 +#endif +LDGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 1 + ble LDGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble LDGEMM_L1x16_END + +LDGEMM_L1x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x16_SUB4 + +LDGEMM_L1x16_LOOP_START: + + #dcbt AO, PRE + LOAD1x16_1 + #dcbt AO, PRE + KERNEL1x16_I1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble LDGEMM_L1x16_LOOP_END + + MY_ALIGN + +LDGEMM_L1x16_LOOP: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt LDGEMM_L1x16_LOOP + +LDGEMM_L1x16_LOOP_END: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB4: + + #dcbt AO, PRE + KERNEL1x16_SUBI1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x16_SAVE + +LDGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SAVE: + + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt LDGEMM_L1x16_BEGIN + +LDGEMM_L1x16_END: + +LDGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L1x1_END + + andi. T1, M, 8 + ble LDGEMM_L1x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x8_SUB4 + +LDGEMM_L1x8_LOOP_START: + + #dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble LDGEMM_L1x8_LOOP_END + + MY_ALIGN + +LDGEMM_L1x8_LOOP: + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt LDGEMM_L1x8_LOOP + +LDGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x8_SAVE + +LDGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SAVE: + + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 +#endif +LDGEMM_L1x8_END: + +LDGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x4_SUB4 + +LDGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble LDGEMM_L1x4_LOOP_END + + MY_ALIGN + +LDGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt LDGEMM_L1x4_LOOP + +LDGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x4_SAVE + +LDGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SAVE: + + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 +#endif +LDGEMM_L1x4_END: + +LDGEMM_L1x2_BEGIN: + + + andi. 
T1, M, 2 + ble LDGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x2_SUB4 + +LDGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble LDGEMM_L1x2_LOOP_END + + MY_ALIGN + +LDGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt LDGEMM_L1x2_LOOP + +LDGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x2_SAVE + +LDGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SAVE: + + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 +#endif +LDGEMM_L1x2_END: + +LDGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x1_SUB4 + +LDGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble LDGEMM_L1x1_LOOP_END + + MY_ALIGN + +LDGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt LDGEMM_L1x1_LOOP + +LDGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x1_SAVE + +LDGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SAVE: + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 +#endif +LDGEMM_L1x1_END: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S index c4b8270b8..4eddab24f 100644 --- a/kernel/power/dgemm_macros_power9.S +++ b/kernel/power/dgemm_macros_power9.S @@ -1,3623 +1,3623 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - -/********************************************************************* -* Macros for N=4, M=16 * -*********************************************************************/ -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - xxlxor vs36,vs36,vs36 - xxlxor vs37,vs37,vs37 - xxlxor vs38,vs38,vs38 - xxlxor vs39,vs39,vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif -.endm - - -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -.macro KERNEL4x16_L1_L2 Index,IsLast - 
KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete - -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 -.else - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 -.endif - lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) - lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - -.else - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, vs26 -.endif - lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - - - - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - - - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - 
lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) -.endif - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) - lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 -.if \Complete==0 - lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) -.endif - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) -.endif - xvmaddadp vs52, vs12, vs30 - xvmaddadp vs53, vs13, vs30 - xvmaddadp vs54, vs14, vs30 - xvmaddadp vs55, vs15, vs30 -.if \Complete==0 - lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) -.endif - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 - - - xvmaddadp vs60, vs12, vs31 - - xvmaddadp vs61, vs13, vs31 - xvmaddadp vs62, vs14, vs31 - - xvmaddadp vs63, vs15, vs31 - .if \IsLast==1 - .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) - .else - addi \AREG, \AREG, DISP32(\Index,256) - addi \BREG, \BREG, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x16 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) - - - - addi BO, BO, 32 - addi AO, AO, 128 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, 
vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 - -.endif -.endm - -.macro SAVE4x16_REGS - add C2, CO, LDC - add C3, C2, LDC - add C4, C3, LDC -.endm - -.macro SAVE4x16 -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs24, 64(CO) - lxv vs26, 80(CO) - lxv vs28, 96(CO) - lxv vs30, 112(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C2) - lxv vs3, 16(C2) - lxv vs5, 32(C2) - lxv vs7, 48(C2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs25, 64(C2) - lxv vs27, 80(C2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 -#ifndef TRMMKERNEL - lxv vs29, 96(C2) - lxv vs31, 112(C2) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - -#endif - xxpermdi vs8, vs44,vs36,1 - xxpermdi vs9 ,vs36,vs44,1 - xxpermdi vs10, vs45,vs37,1 - xxpermdi vs11 ,vs37,vs45,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - xxpermdi vs12, vs46,vs38,1 - xxpermdi vs13 ,vs38,vs46,1 - xxpermdi vs14, vs47,vs39,1 - xxpermdi vs15 ,vs39,vs47,1 - -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r - -#endif - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - stxv vs24, 64(CO) - stxv vs26, 80(CO) - stxv vs28, 96(CO) - stxv vs30, 112(CO) - - stxv vs1, 0(C2) - stxv vs3, 16(C2) - stxv vs5, 32(C2) - stxv vs7, 48(C2) - - stxv vs25, 64(C2) - stxv vs27, 80(C2) - stxv vs29, 96(C2) - stxv vs31, 112(C2) -#ifndef TRMMKERNEL - lxv vs0, 0(C3) - lxv vs2, 16(C3) - lxv vs4, 32(C3) - lxv vs6, 48(C3) -#endif - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs24, 64(C3) - lxv vs26, 80(C3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs28, 96(C3) - lxv vs30, 112(C3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C4) - lxv vs3, 16(C4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(C4) - lxv vs7, 48(C4) - - lxv vs25, 64(C4) - lxv vs27, 80(C4) - lxv vs29, 96(C4) - lxv vs31, 112(C4) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r 
- xvmuldp vs3, vs11, alpha_r - -#endif - - xxpermdi vs8, vs60,vs52,1 - xxpermdi vs9 ,vs52,vs60,1 - xxpermdi vs10, vs61,vs53,1 - xxpermdi vs11 ,vs53,vs61,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - - - xxpermdi vs12, vs62,vs54,1 - xxpermdi vs13 ,vs54,vs62,1 - xxpermdi vs14, vs63,vs55,1 - xxpermdi vs15 ,vs55,vs63,1 -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r -#endif - stxv vs0, 0(C3) - stxv vs2, 16(C3) - stxv vs4, 32(C3) - stxv vs6, 48(C3) - - stxv vs24, 64(C3) - stxv vs26, 80(C3) - stxv vs28, 96(C3) - stxv vs30, 112(C3) - - stxv vs1, 0(C4) - stxv vs3, 16(C4) - stxv vs5, 32(C4) - stxv vs7, 48(C4) - - stxv vs25, 64(C4) - stxv vs27, 80(C4) - stxv vs29, 96(C4) - stxv vs31, 112(C4) - - addi CO, CO, 128 -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - -.endif -.endm - - - -.macro KERNEL4x8_L1_L2 Index,IsLast - KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index,0+\OffsetA)(AO) - lxv vs9, DISP16(\Index,16+\OffsetA)(AO) -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - - lxv vs10, DISP16(\Index,32+\OffsetA)(AO) - lxv vs11, DISP16(\Index,48+\OffsetA)(AO) - - - -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - - lxv vs28, DISP8(\Index,0 
+\OffsetB)(BO) - lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(AO) - lxv vs1, DISP16(\Index,80+\OffsetA)(AO) -.endif - - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.if \Complete==0 - lxv vs2, DISP16(\Index,96+\OffsetA)(AO) - lxv vs3, DISP16(\Index,112+\OffsetA)(AO) -.endif - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) - lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) -.endif - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - - .if \IsLast==1 - .if \Complete==1 - addi AO, AO, DISP16(\Index,64+\OffsetA) - addi BO, BO, DISP8(\Index,32+\OffsetB) - .else - addi AO, AO, DISP16(\Index,128) - addi BO, BO, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x8 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - - - addi BO, BO, 32 - addi AO, AO, 64 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - -.endif -.endm - - - -.macro SAVE4x8 - add T2, CO, LDC - add T3, T2, LDC - add T4, T3, LDC -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T2) - lxv vs3, 16(T2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T2) - lxv vs7, 48(T2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 - - - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - 
xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - - stxv vs1, 0(T2) - stxv vs3, 16(T2) - stxv vs5, 32(T2) - stxv vs7, 48(T2) - - - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs0, 0(T3) - lxv vs2, 16(T3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs4, 32(T3) - lxv vs6, 48(T3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T4) - lxv vs3, 16(T4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T4) - lxv vs7, 48(T4) - - - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(T3) - stxv vs2, 16(T3) - stxv vs4, 32(T3) - stxv vs6, 48(T3) - - - stxv vs1, 0(T4) - stxv vs3, 16(T4) - stxv vs5, 32(T4) - stxv vs7, 48(T4) - - - - addi CO, CO, 64 -.endm - - -/********************************************************************* -* Macros for N=4, M=4 * -*********************************************************************/ - -.macro LOAD4x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, 
vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro SAVE4x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=4, M=2 * -*********************************************************************/ - -.macro LOAD4x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_E2 - - - xvmaddadp vs32, vs8, 
vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r -#else - xvmuldp vs0, vs48, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r -#else - xvmuldp vs8, vs56, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=4, M=1 * -*********************************************************************/ - -.macro LOAD4x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx 
vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs48, alpha_r -#else - xsmuldp vs0, vs48, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs56, alpha_r -#else - xsmuldp vs8, vs56, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=2, M=16 * -*********************************************************************/ - -.macro LOAD2x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL2x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_E2 - - - xvmaddadp vs32, vs8, vs28 
- xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro SAVE2x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - 
xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD2x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro SAVE2x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - 
- stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=2, M=4 * -*********************************************************************/ - -.macro LOAD2x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=2, M=2 * -*********************************************************************/ - -.macro LOAD2x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - 
xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_E2 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=2, M=1 * -*********************************************************************/ - -.macro LOAD2x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=1, M=16 * -*********************************************************************/ - -.macro LOAD1x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, 
o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL1x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro SAVE1x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - 
xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD1x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro SAVE1x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=1, M=4 * -*********************************************************************/ - -.macro LOAD1x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro 
KERNEL1x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=1, M=2 * -*********************************************************************/ - -.macro LOAD1x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_E2 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=1, M=1 * -*********************************************************************/ - -.macro LOAD1x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_E2 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - 
stxsdx vs0, 0, T1 - - addi CO, CO, 8 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + xxlxor vs36,vs36,vs36 + xxlxor vs37,vs37,vs37 + xxlxor vs38,vs38,vs38 + xxlxor vs39,vs39,vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + + +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro KERNEL4x16_L1_L2 Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L2 
AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete + +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 +.else + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 +.endif + lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) + lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + +.else + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 +.endif + lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) +.endif + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) + lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + 
xvmaddadp vs43, vs11, vs29 +.if \Complete==0 + lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) +.endif + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) +.endif + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 +.if \Complete==0 + lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) +.endif + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + + xvmaddadp vs60, vs12, vs31 + + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + + xvmaddadp vs63, vs15, vs31 + .if \IsLast==1 + .if \Complete==1 + addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) + .else + addi \AREG, \AREG, DISP32(\Index,256) + addi \BREG, \BREG, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x16 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) + + + + addi BO, BO, 32 + addi AO, AO, 128 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endif +.endm + +.macro SAVE4x16_REGS + add C2, CO, LDC + add C3, C2, LDC + add C4, C3, LDC +.endm + +.macro SAVE4x16 +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + 
xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs24, 64(CO) + lxv vs26, 80(CO) + lxv vs28, 96(CO) + lxv vs30, 112(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C2) + lxv vs3, 16(C2) + lxv vs5, 32(C2) + lxv vs7, 48(C2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs25, 64(C2) + lxv vs27, 80(C2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 +#ifndef TRMMKERNEL + lxv vs29, 96(C2) + lxv vs31, 112(C2) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + xxpermdi vs8, vs44,vs36,1 + xxpermdi vs9 ,vs36,vs44,1 + xxpermdi vs10, vs45,vs37,1 + xxpermdi vs11 ,vs37,vs45,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + xxpermdi vs12, vs46,vs38,1 + xxpermdi vs13 ,vs38,vs46,1 + xxpermdi vs14, vs47,vs39,1 + xxpermdi vs15 ,vs39,vs47,1 + +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r + +#endif + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + stxv vs24, 64(CO) + stxv vs26, 80(CO) + stxv vs28, 96(CO) + stxv vs30, 112(CO) + + stxv vs1, 0(C2) + stxv vs3, 16(C2) + stxv vs5, 32(C2) + stxv vs7, 48(C2) + + stxv vs25, 64(C2) + stxv vs27, 80(C2) + stxv vs29, 96(C2) + stxv vs31, 112(C2) +#ifndef TRMMKERNEL + lxv vs0, 0(C3) + lxv vs2, 16(C3) + lxv vs4, 32(C3) + lxv vs6, 48(C3) +#endif + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs24, 64(C3) + lxv vs26, 80(C3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs28, 96(C3) + lxv vs30, 112(C3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C4) + lxv vs3, 16(C4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(C4) + lxv vs7, 48(C4) + + lxv vs25, 64(C4) + lxv vs27, 80(C4) + lxv vs29, 96(C4) + lxv vs31, 112(C4) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + + xxpermdi vs8, vs60,vs52,1 + xxpermdi vs9 ,vs52,vs60,1 + xxpermdi vs10, vs61,vs53,1 + xxpermdi vs11 ,vs53,vs61,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + + + xxpermdi vs12, vs62,vs54,1 + xxpermdi vs13 
,vs54,vs62,1 + xxpermdi vs14, vs63,vs55,1 + xxpermdi vs15 ,vs55,vs63,1 +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r +#endif + stxv vs0, 0(C3) + stxv vs2, 16(C3) + stxv vs4, 32(C3) + stxv vs6, 48(C3) + + stxv vs24, 64(C3) + stxv vs26, 80(C3) + stxv vs28, 96(C3) + stxv vs30, 112(C3) + + stxv vs1, 0(C4) + stxv vs3, 16(C4) + stxv vs5, 32(C4) + stxv vs7, 48(C4) + + stxv vs25, 64(C4) + stxv vs27, 80(C4) + stxv vs29, 96(C4) + stxv vs31, 112(C4) + + addi CO, CO, 128 +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + +.endif +.endm + + + +.macro KERNEL4x8_L1_L2 Index,IsLast + KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index,0+\OffsetA)(AO) + lxv vs9, DISP16(\Index,16+\OffsetA)(AO) +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + + lxv vs10, DISP16(\Index,32+\OffsetA)(AO) + lxv vs11, DISP16(\Index,48+\OffsetA)(AO) + + + +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + + lxv vs28, DISP8(\Index,0 +\OffsetB)(BO) + lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + 
+.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(AO) + lxv vs1, DISP16(\Index,80+\OffsetA)(AO) +.endif + + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.if \Complete==0 + lxv vs2, DISP16(\Index,96+\OffsetA)(AO) + lxv vs3, DISP16(\Index,112+\OffsetA)(AO) +.endif + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) + lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) +.endif + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + + .if \IsLast==1 + .if \Complete==1 + addi AO, AO, DISP16(\Index,64+\OffsetA) + addi BO, BO, DISP8(\Index,32+\OffsetB) + .else + addi AO, AO, DISP16(\Index,128) + addi BO, BO, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x8 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + + + addi BO, BO, 32 + addi AO, AO, 64 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + +.endif +.endm + + + +.macro SAVE4x8 + add T2, CO, LDC + add T3, T2, LDC + add T4, T3, LDC +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T2) + lxv vs3, 16(T2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T2) + lxv vs7, 48(T2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 + + + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(CO) + stxv vs2, 16(CO) + 
stxv vs4, 32(CO) + stxv vs6, 48(CO) + + + stxv vs1, 0(T2) + stxv vs3, 16(T2) + stxv vs5, 32(T2) + stxv vs7, 48(T2) + + + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs0, 0(T3) + lxv vs2, 16(T3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs4, 32(T3) + lxv vs6, 48(T3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T4) + lxv vs3, 16(T4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T4) + lxv vs7, 48(T4) + + + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(T3) + stxv vs2, 16(T3) + stxv vs4, 32(T3) + stxv vs6, 48(T3) + + + stxv vs1, 0(T4) + stxv vs3, 16(T4) + stxv vs5, 32(T4) + stxv vs7, 48(T4) + + + + addi CO, CO, 64 +.endm + + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp 
vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + 
lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, 
alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + 
lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
+*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + 
xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, 
vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, 
vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
+*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, 
vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* 
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index bd74d20e5..58dcdec5a 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -1,328 +1,328 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - -#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code - -#if !defined(USE_MASK_PERMUTATIONS) - -static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgew %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgow %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { - - BLASLONG index; - BLASLONG i=0; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned int static_index0 = {0,1,2,3}; -#else - register __vector unsigned int static_index0 = {2,0,3,1}; -#endif - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0; - register __vector unsigned int static_index2=static_index0 +temp1; - register __vector unsigned int static_index3=static_index1 +temp1; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - - register __vector float * v_ptrx=(__vector float *)x; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; -#endif - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vv0,vf0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(vv0,quadruple_values); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs 
are exactly the same we will choose minimum between index - // otherwise we will assign index of the maximum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - - - - - - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - max = ciamax_kernel_32(n1, x, &maxf); - i = n1; - ix = n1 << 1; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); - - } else { - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (max + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code + +#if !defined(USE_MASK_PERMUTATIONS) + +static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgew %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgow %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i=0; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned int static_index0 = {0,1,2,3}; +#else + register __vector unsigned int static_index0 = {2,0,3,1}; +#endif + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0; + register __vector unsigned int static_index2=static_index0 +temp1; + register __vector unsigned int static_index3=static_index1 +temp1; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; +#endif + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + 
}else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 336766245..843370c6c 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -1,266 +1,266 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
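Note: the icamax routine above implements the standard BLAS selection rule: return the 1-based index of the first complex element whose |Re| + |Im| is strictly largest, handing the VSX kernel only the n & -32 prefix and finishing with a scalar loop. A minimal scalar sketch of that rule, useful for cross-checking the SIMD path; the function name and the plain C types are illustrative stand-ins, not part of the source:

#include <math.h>
#include <stddef.h>

/* Reference icamax: 1-based index of the first complex element with the
 * largest |Re| + |Im|.  x holds n interleaved (re, im) pairs with a stride
 * of inc_x complex elements.  Returns 0 for n == 0 or inc_x <= 0, matching
 * the kernel driver's early exit. */
static size_t ref_icamax(size_t n, const float *x, ptrdiff_t inc_x)
{
    if (n == 0 || inc_x <= 0) return 0;
    size_t best = 0;
    float best_val = fabsf(x[0]) + fabsf(x[1]);
    for (size_t i = 1; i < n; i++) {
        const float *p = x + 2 * (size_t)inc_x * i;
        float v = fabsf(p[0]) + fabsf(p[1]);
        if (v > best_val) {      /* strictly greater keeps the first maximum */
            best_val = v;
            best = i;
        }
    }
    return best + 1;             /* BLAS index results are 1-based */
}

The strict comparison is what preserves the "first occurrence wins" convention that the vectorized path must also respect.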
-*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - - - -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - float first_min=CABS1(x,0); - register __vector float quadruple_values={first_min,first_min,first_min,first_min}; - - register __vector float * v_ptrx=(__vector float *)x; - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vf0,vv0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(quadruple_values,vv0); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs are exactly the same we will choose minimum between index - // otherwise we will assign index of the minimum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = ciamin_kernel_32(n1, x, &minf); - i = n1; - ix = n1 << 1; - } - - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); - - } else { - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (min + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + 
unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c index bf1af78d6..fb2dafec0 100644 --- a/kernel/power/isamax.c +++ b/kernel/power/isamax.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
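Note: the four-way reduction at the end of these kernels (icamin above, isamax/isamin below) deliberately breaks exact ties toward the smaller index so the vector path matches the scalar "first occurrence wins" behaviour. A compact equivalent of that pairwise vec_extract/compare sequence, written as a loop over the four lanes; the names are illustrative only, and the amax variants use the same structure with the comparison direction reversed:

/* Reduce four (value, index) candidates to one, preferring the smaller
 * value and, on an exact tie, the smaller index -- the same rule the
 * pairwise a1..a4 / i1..i4 selection above encodes for icamin. */
static void reduce4_min(const float v[4], const unsigned int idx[4],
                        float *out_val, unsigned int *out_idx)
{
    float bv = v[0];
    unsigned int bi = idx[0];
    for (int k = 1; k < 4; k++) {
        if (v[k] < bv || (v[k] == bv && idx[k] < bi)) {
            bv = v[k];
            bi = idx[k];
        }
    }
    *out_val = bv;
    *out_idx = bi;
}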
- *****************************************************************************/ -#include "common.h" -#include -#include - - -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - register __vector float * v_ptrx=(__vector float *)x; - for(; ii2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - max = siamax_kernel_64(n1, x, &maxf); - - i = n1; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); - } -} +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c index 1c1f0ad78..60c843f58 100644 --- a/kernel/power/isamin.c +++ b/kernel/power/isamin.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The 
OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; - register __vector float * v_ptrx=(__vector float *)x; - register __vector float quadruple_values=vec_abs(v_ptrx[0]); - for(; ii2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = siamin_kernel_64(n1, x, &minf); - i = n1; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); - } -} 
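Note: the isamax driver above and the isamin driver below hand the vector kernel only the largest multiple-of-64 prefix (n & -64), because the kernel requires n > 0 and n % 64 == 0, and they finish the remainder with a scalar loop. A hedged outline of that split in plain C; kernel_min64 and imin_abs are stand-in names, and the stand-in kernel is scalar here purely so the sketch compiles:

#include <math.h>

/* Stand-in for the VSX kernel (siamin_kernel_64 in the real file): 0-based
 * argmin of |x[i]| over exactly n1 elements, n1 % 64 == 0, n1 > 0, writing
 * the minimum magnitude to *minf. */
static long kernel_min64(long n1, const float *x, float *minf)
{
    long best = 0;
    float bv = fabsf(x[0]);
    for (long i = 1; i < n1; i++)
        if (fabsf(x[i]) < bv) { bv = fabsf(x[i]); best = i; }
    *minf = bv;
    return best;
}

/* Contiguous (inc_x == 1) driver: vectorizable prefix of n & -64 elements,
 * then a scalar tail, returning a 1-based index as BLAS requires. */
static long imin_abs(long n, const float *x)
{
    if (n <= 0) return 0;
    long best = 0;
    long i = 0;
    float best_val = fabsf(x[0]);

    long n1 = n & -64;               /* largest multiple of 64 <= n */
    if (n1 > 0) {
        best = kernel_min64(n1, x, &best_val);
        i = n1;                      /* scalar tail starts where the kernel stopped */
    }
    for (; i < n; i++)
        if (fabsf(x[i]) < best_val) { best_val = fabsf(x[i]); best = i; }
    return best + 1;
}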
+/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < 
minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index 7a0f3143e..5cdc83d87 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -1,272 +1,272 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
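Note: when inc_x is not 1, the isamax/isamin drivers above skip vectorization and use a scalar loop unrolled by four, with i stepping through memory in units of inc_x while j counts logical elements; the two counters are easy to confuse. A non-unrolled equivalent of that bookkeeping, with hypothetical names:

#include <math.h>

/* Strided argmin of |x|: i walks physical float offsets in steps of inc_x,
 * j counts logical BLAS elements.  Same result as the 4-way unrolled loop
 * in the drivers above, minus the unrolling. */
static long imin_abs_strided(long n, const float *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 0, i = inc_x, j = 1;
    float best_val = fabsf(x[0]);
    while (j < n) {
        if (fabsf(x[i]) < best_val) { best_val = fabsf(x[i]); best = j; }
        i += inc_x;
        j++;
    }
    return best + 1;                 /* 1-based */
}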
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs20 -#define save_permute_1 vs21 -#define save_permute_2 vs22 -#define permute_mask vs23 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define T11 r29 - -#define T12 r30 -#define T13 r31 - -#include "sgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_11, 0x1415161718191a1b -.equ save_permute_12, 0x0405060708090a0b -.equ save_permute_21, 0x101112131c1d1e1f -.equ save_permute_22, 0x000102030c0d0e0f - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - slwi LDC, LDC, 2 - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - lis T5, save_permute_22@highest - lis T6, save_permute_21@highest - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - ori T5, T5, save_permute_22@higher - ori T6, T6, save_permute_21@higher - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - rldicr T5, T5, 32, 31 - rldicr T6, T6, 32, 31 - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - oris T5, T5, save_permute_22@h - oris T6, T6, save_permute_21@h - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - ori T5, T5, save_permute_22@l - ori T6, T6, save_permute_21@l - li r0,0 - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - mtvsrdd save_permute_2,T5,T6 - -#include "sgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 + +#include "sgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index a34ed32b8..4022959e2 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,2192 +1,2192 @@ -#define MY_ALIGN .align 3 -b L8 - - MY_ALIGN -LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_2 - MY_ALIGN - -LSGEMM_L8x16_LOOP: - KERNEL8x16_L2 128,64,0,0 -LSGEMM_L8x16_K128: - KERNEL8x16_L2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64, 3,0 - KERNEL8x16_I1_L4_2 128,64, 4,0 - KERNEL8x16_I1_L4_2 128,64, 5,0 - KERNEL8x16_I1_L4_2 128,64, 6,0 - KERNEL8x16_I1_L4_2 128,64, 7,0 - KERNEL8x16_I1_L4_2 128,64, 8,0 - KERNEL8x16_I1_L4_2 128,64, 9,0 - KERNEL8x16_I1_L4_2 128,64, 10,0 - KERNEL8x16_I1_L4_2 128,64, 11,0 - KERNEL8x16_I1_L4_2 128,64, 12,0 - KERNEL8x16_I1_L4_2 128,64, 13,0 - KERNEL8x16_I1_L4_2 128,64, 14,0 - KERNEL8x16_I1_L4_2 128,64, 15,0 - KERNEL8x16_I1_L4_2 128,64, 16,0 - KERNEL8x16_I1_L4_2 128,64, 17,0 - KERNEL8x16_I1_L4_2 128,64, 18,0 - KERNEL8x16_I1_L4_2 128,64, 19,0 - KERNEL8x16_I1_L4_2 128,64, 20,0 - KERNEL8x16_I1_L4_2 128,64, 21,0 - KERNEL8x16_I1_L4_2 128,64, 22,0 - KERNEL8x16_I1_L4_2 128,64, 23,0 - KERNEL8x16_I1_L4_2 128,64, 24,0 - KERNEL8x16_I1_L4_2 128,64, 25,0 - KERNEL8x16_I1_L4_2 128,64, 26,0 - KERNEL8x16_I1_L4_2 
128,64, 27,0 - KERNEL8x16_I1_L4_2 128,64, 28,0 - KERNEL8x16_I1_L4_2 128,64, 29,0 - KERNEL8x16_I1_L4_2 128,64, 30,0 - KERNEL8x16_I1_L4_2 128,64, 31,1 - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - END8x16_2 - blr - - MY_ALIGN -LSGEMM_L8x16_L64_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_2 128,64,7,0 - KERNEL8x16_I1_L4_2 128,64,8,0 - KERNEL8x16_I1_L4_2 128,64,9,0 - KERNEL8x16_I1_L4_2 128,64,10,0 - KERNEL8x16_I1_L4_2 128,64,11,0 - KERNEL8x16_I1_L4_2 128,64,12,0 - KERNEL8x16_I1_L4_2 128,64,13,0 - KERNEL8x16_I1_L4_2 128,64,14,0 - KERNEL8x16_I1_L4_3 128,64,15,1 - blr -LSGEMM_L8x16_L32_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_3 128,64,7,1 - blr - -LSGEMM_L8x16_L16_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_3 128,64,3,1 - blr - -L8: -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 3 - - ble LSGEMM_L8_END - -LSGEMM_L8_BEGIN: - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 3 - add C, C, T3 - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L8x16_END - - MY_ALIGN -LSGEMM_L8x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 - mr T12, T11 - addi T12,T12, -2 - srawi. L, T12, 7 /**(T11-2) % 128x */ -#else - mr T12, K - addi T12,T12, -2 - srawi. L, T12, 7 /**(K-2) % 128x */ -#endif - - ZERO8x16 - ble LSGEMM_L8x16_SUB0 - mtctr L - bl LSGEMM_L8x16_LMAIN_SUB - andi. L, T12, 127 - ble LSGEMM_L8x16_SAVE - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 255 - cmpwi T11,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T10,1 - bne CMP8x16_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD8x16 64,32 - END8x16_WITHOUT_ADD - LOAD8x16_2O AO,BO, 128, 64 - mtctr T10 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE -CMP8x16_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T11,128 -#else - cmpwi K,128 -#endif - bne LSGEMM_L8x16_SUB2 - MY_ALIGN - mtctr T10 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD8x16_2O AO,BO, 128,64 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE - MY_ALIGN -LSGEMM_L8x16_SUB2: - andi. T10,L,64 - ble LSGEMM_L8x16_SUB2_32 - bl LSGEMM_L8x16_L64_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_32: - andi. T10,L, 32 - ble LSGEMM_L8x16_SUB2_16 - bl LSGEMM_L8x16_L32_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L8x16_SUB2_8 - bl LSGEMM_L8x16_L16_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_3 128,64, 1,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_2 - KERNEL8x16_I1_L4_3 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_2 - KERNEL8x16_E2 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_1: - andi. 
T10,L, 1 - ble LSGEMM_L8x16_SAVE - KERNEL8x16 0 - - - MY_ALIGN -LSGEMM_L8x16_SAVE: - SAVE8x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L8x16_BEGIN - MY_ALIGN -LSGEMM_L8x16_END: -LSGEMM_L8x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 8 - ble LSGEMM_L8x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x8 - ble LSGEMM_L8x8_SUB0 - - MY_ALIGN -LSGEMM_L8x8_LOOP_START: - - LOAD8x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x8_LOOP: - - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_2 32,32, 1,0 - KERNEL8x8_I1_L4_2 32,32, 2,0 - KERNEL8x8_I1_L4_2 32,32, 3,1 - - bdnz LSGEMM_L8x8_LOOP - - MY_ALIGN -LSGEMM_L8x8_LOOP_END: - - END8x8 0, AO, BO, 32, 32 - - b LSGEMM_L8x8_SUB1 - MY_ALIGN -LSGEMM_L8x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x8_SUB2 - MY_ALIGN -LSGEMM_L8x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x8_SAVE - MY_ALIGN -LSGEMM_L8x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x8_SUB2_LOOP: - LOAD8x8_0 - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_3 32,32, 1,1 - bdnz LSGEMM_L8x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x8_SUB2_2 - LOAD8x8_0 - KERNEL8x8_I1_L4_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x8_SUB2_1 - LOAD8x8_0 - KERNEL8x8_I1_L2_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x8_SAVE - KERNEL8x8 0 - - - MY_ALIGN -LSGEMM_L8x8_SAVE: - SAVE8x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 -#endif - MY_ALIGN -LSGEMM_L8x8_END: -LSGEMM_L8x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 4 - ble LSGEMM_L8x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x4 - ble LSGEMM_L8x4_SUB0 - - MY_ALIGN -LSGEMM_L8x4_LOOP_START: - - LOAD8x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x4_LOOP: - - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_2 16,32, 1,0 - KERNEL8x4_I1_L4_2 16,32, 2,0 - KERNEL8x4_I1_L4_2 16,32, 3,1 - - bdnz LSGEMM_L8x4_LOOP - - MY_ALIGN -LSGEMM_L8x4_LOOP_END: - - END8x4 0, AO, BO, 16, 32 - - b LSGEMM_L8x4_SUB1 - MY_ALIGN -LSGEMM_L8x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x4_SUB2 - MY_ALIGN -LSGEMM_L8x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x4_SAVE - MY_ALIGN -LSGEMM_L8x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x4_SUB2_LOOP: - LOAD8x4_0 - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_3 16,32, 1,1 - bdnz LSGEMM_L8x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x4_SUB2_2 - LOAD8x4_0 - KERNEL8x4_I1_L4_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L8x4_SUB2_1 - LOAD8x4_0 - KERNEL8x4_I1_L2_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x4_SAVE - KERNEL8x4 0 - - - MY_ALIGN -LSGEMM_L8x4_SAVE: - SAVE8x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 -#endif - MY_ALIGN -LSGEMM_L8x4_END: -LSGEMM_L8x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L8x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x2 - ble LSGEMM_L8x2_SUB0 - - MY_ALIGN -LSGEMM_L8x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x2_LOOP: - - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,0 - KERNEL8x2_2 0,0, 2,0 - KERNEL8x2_2 0,0, 3,1 - - bdnz LSGEMM_L8x2_LOOP - - MY_ALIGN -LSGEMM_L8x2_LOOP_END: - -LSGEMM_L8x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x2_SAVE - MY_ALIGN -LSGEMM_L8x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x2_SUB2_2 - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x2_SUB2_1 - KERNEL8x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x2_SAVE - KERNEL8x2 - - MY_ALIGN -LSGEMM_L8x2_SAVE: - SAVE8x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 -#endif - MY_ALIGN -LSGEMM_L8x2_END: -LSGEMM_L8x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L8x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x1 - ble LSGEMM_L8x1_SUB0 - - MY_ALIGN -LSGEMM_L8x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x1_LOOP: - - KERNEL8x1_4 0,0, 0,0 - KERNEL8x1_4 0,0, 1,1 - - bdnz LSGEMM_L8x1_LOOP - - MY_ALIGN -LSGEMM_L8x1_LOOP_END: - -LSGEMM_L8x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x1_SAVE - MY_ALIGN -LSGEMM_L8x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x1_SUB2_2 - KERNEL8x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x1_SUB2_1 - KERNEL8x1_2 - MY_ALIGN -LSGEMM_L8x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x1_SAVE - KERNEL8x1 - - MY_ALIGN -LSGEMM_L8x1_SAVE: - SAVE8x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 -#endif - MY_ALIGN -LSGEMM_L8x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 8 -#endif - addic. J, J, -1 - bgt LSGEMM_L8_BEGIN - - -LSGEMM_L8_END: - -/* b LSGEMM_L4_BEGIN*/ - andi. T1, N, 4 - ble LSGEMM_L4_END -LSGEMM_L4_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L4x16_END - - MY_ALIGN -LSGEMM_L4x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. 
L, T12, 6 /**(K-1) % 64x */ -#endif - - ZERO4x16 - ble LSGEMM_L4x16_SUB0 - - MY_ALIGN -LSGEMM_L4x16_LOOP_START: - - LOAD4x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=16 - addi AO,AO,2112 - addi BO,BO,16 - - mtctr L - - MY_ALIGN - -LSGEMM_L4x16_LOOP: - - KERNEL4x16_I1_L4_2 -2048,0, 0,0 - KERNEL4x16_I1_L4_2 -2048,0, 1,0 - KERNEL4x16_I1_L4_2 -2048,0, 2,0 - KERNEL4x16_I1_L4_2 -2048,0, 3,0 - KERNEL4x16_I1_L4_2 -2048,0, 4,0 - KERNEL4x16_I1_L4_2 -2048,0, 5,0 - KERNEL4x16_I1_L4_2 -2048,0, 6,0 - KERNEL4x16_I1_L4_2 -2048,0, 7,0 - KERNEL4x16_I1_L4_2 -2048,0, 8,0 - KERNEL4x16_I1_L4_2 -2048,0, 9,0 - KERNEL4x16_I1_L4_2 -2048,0, 10,0 - KERNEL4x16_I1_L4_2 -2048,0, 11,0 - KERNEL4x16_I1_L4_2 -2048,0, 12,0 - KERNEL4x16_I1_L4_2 -2048,0, 13,0 - KERNEL4x16_I1_L4_2 -2048,0, 14,0 - KERNEL4x16_I1_L4_2 -2048,0, 15,1 - - bdnz LSGEMM_L4x16_LOOP - - MY_ALIGN -LSGEMM_L4x16_LOOP_END: - - END4x16 0, AO, BO, -2048, 0 - - b LSGEMM_L4x16_SUB1 - MY_ALIGN -LSGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 127 -#else - andi. L, K, 127 -#endif - b LSGEMM_L4x16_SUB2 - MY_ALIGN -LSGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L4x16_SAVE - MY_ALIGN -LSGEMM_L4x16_SUB2: - - srawi. T10,L, 5 - ble LSGEMM_L4x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L4x16_SUB2_LOOP: - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_2 64,16, 3,0 - KERNEL4x16_I1_L4_2 64,16, 4,0 - KERNEL4x16_I1_L4_2 64,16, 5,0 - KERNEL4x16_I1_L4_2 64,16, 6,0 - KERNEL4x16_I1_L4_3 64,16, 7,1 - bdnz LSGEMM_L4x16_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_3 64,16, 3,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_3 64,16, 1,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L4_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L4x16_SUB2 - - MY_ALIGN -LSGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L4x16_BEGIN - MY_ALIGN -LSGEMM_L4x16_END: -LSGEMM_L4x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 8 - ble LSGEMM_L4x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x8 - ble LSGEMM_L4x8_SUB0 - - MY_ALIGN -LSGEMM_L4x8_LOOP_START: - - LOAD4x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_2 32,16, 1,0 - KERNEL4x8_I1_L4_2 32,16, 2,0 - KERNEL4x8_I1_L4_2 32,16, 3,1 - - bdnz LSGEMM_L4x8_LOOP - - MY_ALIGN -LSGEMM_L4x8_LOOP_END: - - END4x8 0, AO, BO, 32, 16 - - b LSGEMM_L4x8_SUB1 - MY_ALIGN -LSGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. 
L, K, 31 -#endif - b LSGEMM_L4x8_SUB2 - MY_ALIGN -LSGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x8_SAVE - MY_ALIGN -LSGEMM_L4x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x8_SUB2_LOOP: - LOAD4x8_0 - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_3 32,16, 1,1 - bdnz LSGEMM_L4x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L4_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x8_SAVE - KERNEL4x8 0 - - - MY_ALIGN -LSGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 -#endif - MY_ALIGN -LSGEMM_L4x8_END: -LSGEMM_L4x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 4 - ble LSGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x4 - ble LSGEMM_L4x4_SUB0 - - MY_ALIGN -LSGEMM_L4x4_LOOP_START: - - LOAD4x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x4_LOOP: - - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_2 16,16, 1,0 - KERNEL4x4_I1_L4_2 16,16, 2,0 - KERNEL4x4_I1_L4_2 16,16, 3,1 - - bdnz LSGEMM_L4x4_LOOP - - MY_ALIGN -LSGEMM_L4x4_LOOP_END: - - END4x4 0, AO, BO, 16, 16 - - b LSGEMM_L4x4_SUB1 - MY_ALIGN -LSGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L4x4_SUB2 - MY_ALIGN -LSGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x4_SAVE - MY_ALIGN -LSGEMM_L4x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x4_SUB2_LOOP: - LOAD4x4_0 - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_3 16,16, 1,1 - bdnz LSGEMM_L4x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x4_SUB2_2 - LOAD4x4_0 - KERNEL4x4_I1_L4_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x4_SUB2_1 - LOAD4x4_0 - KERNEL4x4_I1_L2_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x4_SAVE - KERNEL4x4 0 - - - MY_ALIGN -LSGEMM_L4x4_SAVE: - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 -#endif - MY_ALIGN -LSGEMM_L4x4_END: -LSGEMM_L4x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L4x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x2 - ble LSGEMM_L4x2_SUB0 - - MY_ALIGN -LSGEMM_L4x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x2_LOOP: - - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,0 - KERNEL4x2_2 0,0, 2,0 - KERNEL4x2_2 0,0, 3,1 - - bdnz LSGEMM_L4x2_LOOP - - MY_ALIGN -LSGEMM_L4x2_LOOP_END: - -LSGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x2_SAVE - MY_ALIGN -LSGEMM_L4x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x2_SUB2_2 - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L4x2_SUB2_1 - KERNEL4x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -LSGEMM_L4x2_SAVE: - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 -#endif - MY_ALIGN -LSGEMM_L4x2_END: -LSGEMM_L4x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x1 - ble LSGEMM_L4x1_SUB0 - - MY_ALIGN -LSGEMM_L4x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x1_LOOP: - - KERNEL4x1_4 0,0, 0,0 - KERNEL4x1_4 0,0, 1,1 - - bdnz LSGEMM_L4x1_LOOP - - MY_ALIGN -LSGEMM_L4x1_LOOP_END: - -LSGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x1_SAVE - MY_ALIGN -LSGEMM_L4x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x1_SUB2_2 - KERNEL4x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x1_SUB2_1 - KERNEL4x1_2 - MY_ALIGN -LSGEMM_L4x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -LSGEMM_L4x1_SAVE: - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 -#endif - MY_ALIGN -LSGEMM_L4x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - - andi. T2, N, 3 - ble .L999 - -LSGEMM_L4_END: - andi. T1, N, 2 - ble LSGEMM_L2_END -LSGEMM_L2_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 1 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L2x16_END - - MY_ALIGN -LSGEMM_L2x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x16 - ble LSGEMM_L2x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x16_LOOP: - - KERNEL2x16_4 -2048,0, 0,0 - KERNEL2x16_4 -2048,0, 1,0 - KERNEL2x16_4 -2048,0, 2,0 - KERNEL2x16_4 -2048,0, 3,0 - KERNEL2x16_4 -2048,0, 4,0 - KERNEL2x16_4 -2048,0, 5,0 - KERNEL2x16_4 -2048,0, 6,0 - KERNEL2x16_4 -2048,0, 7,0 - KERNEL2x16_4 -2048,0, 8,0 - KERNEL2x16_4 -2048,0, 9,0 - KERNEL2x16_4 -2048,0, 10,0 - KERNEL2x16_4 -2048,0, 11,0 - KERNEL2x16_4 -2048,0, 12,0 - KERNEL2x16_4 -2048,0, 13,0 - KERNEL2x16_4 -2048,0, 14,0 - KERNEL2x16_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x16_SAVE - MY_ALIGN -LSGEMM_L2x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x16_SUB2_16 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,0 - KERNEL2x16_4 0,0, 4,0 - KERNEL2x16_4 0,0, 5,0 - KERNEL2x16_4 0,0, 6,0 - KERNEL2x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x16_SUB2_8 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x16_SUB2_4 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x16_SUB2_2 - KERNEL2x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_2: - andi. 
T10,L, 2 - ble LSGEMM_L2x16_SUB2_1 - KERNEL2x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x16_SAVE - KERNEL2x16 - - MY_ALIGN -LSGEMM_L2x16_SAVE: - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L2x16_BEGIN - MY_ALIGN -LSGEMM_L2x16_END: - andi. I, M, 8 - ble LSGEMM_L2x8_END - - MY_ALIGN -LSGEMM_L2x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x8 - ble LSGEMM_L2x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x8_LOOP: - - KERNEL2x8_4 -2048,0, 0,0 - KERNEL2x8_4 -2048,0, 1,0 - KERNEL2x8_4 -2048,0, 2,0 - KERNEL2x8_4 -2048,0, 3,0 - KERNEL2x8_4 -2048,0, 4,0 - KERNEL2x8_4 -2048,0, 5,0 - KERNEL2x8_4 -2048,0, 6,0 - KERNEL2x8_4 -2048,0, 7,0 - KERNEL2x8_4 -2048,0, 8,0 - KERNEL2x8_4 -2048,0, 9,0 - KERNEL2x8_4 -2048,0, 10,0 - KERNEL2x8_4 -2048,0, 11,0 - KERNEL2x8_4 -2048,0, 12,0 - KERNEL2x8_4 -2048,0, 13,0 - KERNEL2x8_4 -2048,0, 14,0 - KERNEL2x8_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x8_SAVE - MY_ALIGN -LSGEMM_L2x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x8_SUB2_16 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,0 - KERNEL2x8_4 0,0, 4,0 - KERNEL2x8_4 0,0, 5,0 - KERNEL2x8_4 0,0, 6,0 - KERNEL2x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x8_SUB2_8 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x8_SUB2_4 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x8_SUB2_2 - KERNEL2x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x8_SUB2_1 - KERNEL2x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -LSGEMM_L2x8_SAVE: - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 -#endif - MY_ALIGN -LSGEMM_L2x8_END: - andi. I, M, 4 - ble LSGEMM_L2x4_END - - MY_ALIGN -LSGEMM_L2x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x4 - ble LSGEMM_L2x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x4_LOOP: - - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,0 - KERNEL2x4_4 0,0, 8,0 - KERNEL2x4_4 0,0, 9,0 - KERNEL2x4_4 0,0, 10,0 - KERNEL2x4_4 0,0, 11,0 - KERNEL2x4_4 0,0, 12,0 - KERNEL2x4_4 0,0, 13,0 - KERNEL2x4_4 0,0, 14,0 - KERNEL2x4_4 0,0, 15,1 - - bdnz LSGEMM_L2x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x4_SAVE - MY_ALIGN -LSGEMM_L2x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_L2x4_SUB2_16 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x4_SUB2_8 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x4_SUB2_4 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x4_SUB2_2 - KERNEL2x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x4_SUB2_1 - KERNEL2x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x4_SAVE - KERNEL2x4 - - MY_ALIGN -LSGEMM_L2x4_SAVE: - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 -#endif - MY_ALIGN -LSGEMM_L2x4_END: - andi. I, M, 2 - ble LSGEMM_L2x2_END - - MY_ALIGN -LSGEMM_L2x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x2 - ble LSGEMM_L2x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x2_LOOP: - - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,0 - KERNEL2x2_4 0,0, 8,0 - KERNEL2x2_4 0,0, 9,0 - KERNEL2x2_4 0,0, 10,0 - KERNEL2x2_4 0,0, 11,0 - KERNEL2x2_4 0,0, 12,0 - KERNEL2x2_4 0,0, 13,0 - KERNEL2x2_4 0,0, 14,0 - KERNEL2x2_4 0,0, 15,1 - - bdnz LSGEMM_L2x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x2_SAVE - MY_ALIGN -LSGEMM_L2x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x2_SUB2_16 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x2_SUB2_8 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x2_SUB2_4 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x2_SUB2_2 - KERNEL2x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x2_SUB2_1 - KERNEL2x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -LSGEMM_L2x2_SAVE: - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 -#endif - MY_ALIGN -LSGEMM_L2x2_END: - andi. I, M, 1 - ble LSGEMM_L2x1_END - - MY_ALIGN -LSGEMM_L2x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. 
L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x1 - ble LSGEMM_L2x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x1_LOOP: - - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,0 - KERNEL2x1_4 0,0, 8,0 - KERNEL2x1_4 0,0, 9,0 - KERNEL2x1_4 0,0, 10,0 - KERNEL2x1_4 0,0, 11,0 - KERNEL2x1_4 0,0, 12,0 - KERNEL2x1_4 0,0, 13,0 - KERNEL2x1_4 0,0, 14,0 - KERNEL2x1_4 0,0, 15,1 - - bdnz LSGEMM_L2x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x1_SAVE - MY_ALIGN -LSGEMM_L2x1_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x1_SUB2_16 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x1_SUB2_8 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x1_SUB2_4 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x1_SUB2_2 - KERNEL2x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x1_SUB2_1 - KERNEL2x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -LSGEMM_L2x1_SAVE: - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 -#endif - MY_ALIGN -LSGEMM_L2x1_END: - slwi T1, K, 3 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LSGEMM_L2_END: - andi. T1, N, 1 - ble LSGEMM_END -LSGEMM_1_BEGIN: - - - mr AO, A - mr CO, C - add C, C, LDC - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_1x16_END - - MY_ALIGN -LSGEMM_1x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x16 - ble LSGEMM_1x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x16_LOOP: - - KERNEL1x16_4 -2048,0, 0,0 - KERNEL1x16_4 -2048,0, 1,0 - KERNEL1x16_4 -2048,0, 2,0 - KERNEL1x16_4 -2048,0, 3,0 - KERNEL1x16_4 -2048,0, 4,0 - KERNEL1x16_4 -2048,0, 5,0 - KERNEL1x16_4 -2048,0, 6,0 - KERNEL1x16_4 -2048,0, 7,0 - KERNEL1x16_4 -2048,0, 8,0 - KERNEL1x16_4 -2048,0, 9,0 - KERNEL1x16_4 -2048,0, 10,0 - KERNEL1x16_4 -2048,0, 11,0 - KERNEL1x16_4 -2048,0, 12,0 - KERNEL1x16_4 -2048,0, 13,0 - KERNEL1x16_4 -2048,0, 14,0 - KERNEL1x16_4 -2048,0, 15,1 - - bdnz LSGEMM_1x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x16_SAVE - MY_ALIGN -LSGEMM_1x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x16_SUB2_16 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,0 - KERNEL1x16_4 0,0, 4,0 - KERNEL1x16_4 0,0, 5,0 - KERNEL1x16_4 0,0, 6,0 - KERNEL1x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x16_SUB2_8 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x16_SUB2_8: - andi. 
T10,L, 8 - ble LSGEMM_1x16_SUB2_4 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x16_SUB2_2 - KERNEL1x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x16_SUB2_1 - KERNEL1x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x16_SAVE - KERNEL1x16 - - MY_ALIGN -LSGEMM_1x16_SAVE: - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt+ LSGEMM_1x16_BEGIN - MY_ALIGN -LSGEMM_1x16_END: - andi. I, M, 8 - ble LSGEMM_1x8_END - - MY_ALIGN -LSGEMM_1x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x8 - ble LSGEMM_1x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x8_LOOP: - - KERNEL1x8_4 -2048,0, 0,0 - KERNEL1x8_4 -2048,0, 1,0 - KERNEL1x8_4 -2048,0, 2,0 - KERNEL1x8_4 -2048,0, 3,0 - KERNEL1x8_4 -2048,0, 4,0 - KERNEL1x8_4 -2048,0, 5,0 - KERNEL1x8_4 -2048,0, 6,0 - KERNEL1x8_4 -2048,0, 7,0 - KERNEL1x8_4 -2048,0, 8,0 - KERNEL1x8_4 -2048,0, 9,0 - KERNEL1x8_4 -2048,0, 10,0 - KERNEL1x8_4 -2048,0, 11,0 - KERNEL1x8_4 -2048,0, 12,0 - KERNEL1x8_4 -2048,0, 13,0 - KERNEL1x8_4 -2048,0, 14,0 - KERNEL1x8_4 -2048,0, 15,1 - - bdnz LSGEMM_1x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x8_SAVE - MY_ALIGN -LSGEMM_1x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x8_SUB2_16 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,0 - KERNEL1x8_4 0,0, 4,0 - KERNEL1x8_4 0,0, 5,0 - KERNEL1x8_4 0,0, 6,0 - KERNEL1x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x8_SUB2_8 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x8_SUB2_4 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x8_SUB2_2 - KERNEL1x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x8_SUB2_1 - KERNEL1x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x8_SAVE - KERNEL1x8 - - MY_ALIGN -LSGEMM_1x8_SAVE: - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 -#endif - MY_ALIGN -LSGEMM_1x8_END: - andi. I, M, 4 - ble LSGEMM_1x4_END - - MY_ALIGN -LSGEMM_1x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x4 - ble LSGEMM_1x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x4_LOOP: - - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,0 - KERNEL1x4_4 0,0, 8,0 - KERNEL1x4_4 0,0, 9,0 - KERNEL1x4_4 0,0, 10,0 - KERNEL1x4_4 0,0, 11,0 - KERNEL1x4_4 0,0, 12,0 - KERNEL1x4_4 0,0, 13,0 - KERNEL1x4_4 0,0, 14,0 - KERNEL1x4_4 0,0, 15,1 - - bdnz LSGEMM_1x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x4_SAVE - MY_ALIGN -LSGEMM_1x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x4_SUB2_16 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x4_SUB2_8 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x4_SUB2_4 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x4_SUB2_2 - KERNEL1x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x4_SUB2_1 - KERNEL1x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x4_SAVE - KERNEL1x4 - - MY_ALIGN -LSGEMM_1x4_SAVE: - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 -#endif - MY_ALIGN -LSGEMM_1x4_END: - andi. I, M, 2 - ble LSGEMM_1x2_END - - MY_ALIGN -LSGEMM_1x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x2 - ble LSGEMM_1x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x2_LOOP: - - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,0 - KERNEL1x2_4 0,0, 8,0 - KERNEL1x2_4 0,0, 9,0 - KERNEL1x2_4 0,0, 10,0 - KERNEL1x2_4 0,0, 11,0 - KERNEL1x2_4 0,0, 12,0 - KERNEL1x2_4 0,0, 13,0 - KERNEL1x2_4 0,0, 14,0 - KERNEL1x2_4 0,0, 15,1 - - bdnz LSGEMM_1x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x2_SAVE - MY_ALIGN -LSGEMM_1x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x2_SUB2_16 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x2_SUB2_8 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x2_SUB2_4 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x2_SUB2_2 - KERNEL1x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x2_SUB2_1 - KERNEL1x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x2_SAVE - KERNEL1x2 - - MY_ALIGN -LSGEMM_1x2_SAVE: - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 -#endif - MY_ALIGN -LSGEMM_1x2_END: - andi. I, M, 1 - ble LSGEMM_1x1_END - - MY_ALIGN -LSGEMM_1x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x1 - ble LSGEMM_1x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x1_LOOP: - - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,0 - KERNEL1x1_16 0,0, 2,0 - KERNEL1x1_16 0,0, 3,1 - - bdnz LSGEMM_1x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x1_SAVE - MY_ALIGN -LSGEMM_1x1_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x1_SUB2_16 - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,1 - MY_ALIGN -LSGEMM_1x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x1_SUB2_8 - KERNEL1x1_16 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x1_SUB2_4 - KERNEL1x1_8 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x1_SUB2_2 - KERNEL1x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x1_SUB2_1 - KERNEL1x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x1_SAVE - KERNEL1x1 - - MY_ALIGN -LSGEMM_1x1_SAVE: - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 -#endif - MY_ALIGN -LSGEMM_1x1_END: - slwi T1, K, 2 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif +#define MY_ALIGN .align 3 +b L8 + + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_2 + MY_ALIGN + +LSGEMM_L8x16_LOOP: + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16_2 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 + blr + +L8: +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ +#else + mr T12, K + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + mtctr L + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 255 + cmpwi T11,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else + cmpwi K,128 +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 + ble LSGEMM_L8x16_SUB2_16 + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + bl LSGEMM_L8x16_L16_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 + + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. 
T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. 
L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. 
T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. 
T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 2c9e537c7..3750d338d 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -1,5575 +1,5575 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define unit_size 4 -#define DISP64(ind,disp) (ind*unit_size*64+disp) -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -/********************************************************************************************** -* Macros for N=8 and M=16 -**********************************************************************************************/ - - - -.macro KERNEL8x16_L1_L4 Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - -.macro LOAD8x16 OffsetA,OffsetB - - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endm - -.macro END8x16_NORMAL - END8x16 0, AO, BO, 64,32 -.endm - -.macro END8x16_WITHOUT_ADD - END8x16 0, AO,BO,0,0 -.endm - -.macro END8x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, 
vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.endif -.endm - -.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - -KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete - -.endm - -.macro KERNEL8x16 First - - LOAD8x16 0,0 - END8x16 \First, AO, BO, 64,32 -.endm - -.macro LOAD8x16_2 - LOAD8x16_2O AO,BO, 0,0 -.endm - -.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB - lxv vs8, (\OffsetB)(\BREG) - lxv vs12, (16+\OffsetB)(\BREG) - lxv vs24, (32+\OffsetB)(\BREG) - lxv vs28, (32+16+\OffsetB)(\BREG) - lxv vs4, (0+\OffsetA)(\AREG) - lxv vs5, (16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(\AREG) - lxv vs7, (48+\OffsetA)(\AREG) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(\AREG) - lxv vs1, (64+16+\OffsetA)(\AREG) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(\AREG) - lxv vs3, (64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - -.macro END8x16_2 - /*for load2 offset will be 128 and 64*/ - KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.if \Complete==0 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - -.if \Complete==0 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,\OffsetB) - addi \AREG, \AREG, DISP32(\Index,\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - - -.macro SAVE8x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - - - - /* permute to restore butterfly rank 1 updateto normal promoted one */ - /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ - /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ - /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ - /* permute 16 vs24 MEM(32+CO) vs25 
MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) -#endif - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 -#ifndef TRMMKERNEL - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) -#endif - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - - - -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 -#ifndef TRMMKERNEL - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 -#ifndef TRMMKERNEL - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - - - - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - - stxv vs32, 0(CO) - stxv vs33, 16(CO) -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - stxv vs34, 32(CO) - stxv vs35, 48(CO) -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T1) - stxv vs37, 16(T1) -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - stxv vs38, 32(T1) - stxv vs39, 48(T1) - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - - stxv vs40, 0(T2) - stxv vs41, 16(T2) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T2) - stxv vs43, 48(T2) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - stxv vs44, 0(T3) - stxv vs45, 16(T3) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - /*****the same with the second 8X8 ****/ - #ifndef TRMMKERNEL - lxv vs32, 0(T4) - lxv vs33, 16(T4) -#endif - xxmrglw vs8, vs48, vs60 - xxmrglw 
vs10, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs34, 32(T4) - lxv vs35, 48(T4) -#endif - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs36, 0(T5) - lxv vs37, 16(T5) -#endif - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 -#ifndef TRMMKERNEL - lxv vs38,32(T5) - lxv vs39, 48(T5) -#endif - - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 -#ifndef TRMMKERNEL - lxv vs40, 0(T6) - lxv vs41, 16(T6) -#endif - xxmrglw vs16, vs50, vs62 - xxmrglw vs18, vs54, vs58 -#ifndef TRMMKERNEL - lxv vs42, 32(T6) - lxv vs43, 48(T6) -#endif - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - xxmrghw vs4, vs54, vs58 - xxmrghw vs5, vs50, vs62 -#ifndef TRMMKERNEL - lxv vs44, 0(T7) - lxv vs45, 16(T7) -#endif - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 -#ifndef TRMMKERNEL - lxv vs46, 32(T7) - lxv vs47, 48(T7) -#endif - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 - - - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - #ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - stxv vs32, 0(T4) - stxv vs33, 16(T4) - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - stxv vs34, 32(T4) - stxv vs35, 48(T4) - -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T5) - stxv vs37, 16(T5) - -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - - - stxv vs38, 32(T5) - stxv vs39, 48(T5) - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T6) - stxv vs43, 48(T6) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - - stxv vs44, 0(T7) - stxv vs45, 16(T7) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - - stxv vs46, 32(T7) - stxv vs47, 48(T7) - - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=8 and M=8 -**********************************************************************************************/ - -.macro LOAD8x8_1 - LOAD8x8 1 -.endm - -.macro LOAD8x8_0 - LOAD8x8 0 
-.endm - -.macro KERNEL8x8_L1_L4 Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END8x8_NORMAL - END8x8 0, AO, BO, 32,32 -.endm - -.macro Zero8X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - -.endm - -.macro LOAD8x8 Zero - - lxv vs24, 0(BO) - lxv vs28, 16(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endif -.endm - - -.macro END8x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.endm - -.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - 
xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - -.if \Complete==0 - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endm - -.macro KERNEL8x8 First - - LOAD8x8 0 - END8x8 \First, AO, BO, 32,32 -.endm - -.macro KERNEL8x8_L1_L2_I 
AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endif - -.endm - - -.macro SAVE8x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) - lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - lxv vs50, 0(T4) - lxv vs51, 16(T4) - lxv vs54, 0(T5) - lxv vs55, 16(T5) - lxv vs58, 0(T6) - lxv vs59, 16(T6) - lxv vs62, 0(T7) - lxv vs63, 16(T7) -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, 
vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - xxmrglw vs8, vs48, vs60 - xxmrglw vs10, vs52, vs56 - - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 - stxv vs34, 0(CO) - stxv vs35, 16(CO) - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 - stxv vs38, 0(T1) - stxv vs39, 16(T1) - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 - stxv vs42, 0(T2) - stxv vs43, 16(T2) - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - stxv vs46, 0(T3) - stxv vs47, 16(T3) - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - #ifdef TRMMKERNEL - xvmulsp vs50, vs8, alpha_r - xvmulsp vs51, vs12, alpha_r - xvmulsp vs54, vs9, alpha_r - xvmulsp vs55, vs13, alpha_r - xvmulsp vs58, vs10, alpha_r - xvmulsp vs59, vs14, alpha_r - xvmulsp vs62, vs11, alpha_r - xvmulsp vs63, vs15, alpha_r -#else - xvmaddasp vs50, vs8, alpha_r - xvmaddasp vs51, vs12, alpha_r - xvmaddasp vs54, vs9, alpha_r - xvmaddasp vs55, vs13, alpha_r - xvmaddasp vs58, vs10, alpha_r - xvmaddasp vs59, vs14, alpha_r - xvmaddasp vs62, vs11, alpha_r - xvmaddasp vs63, vs15, alpha_r -#endif - - stxv vs50, 0(T4) - stxv vs51, 16(T4) - stxv vs54, 0(T5) - stxv vs55, 16(T5) - stxv vs58, 0(T6) - stxv vs59, 16(T6) - stxv vs62, 0(T7) - stxv vs63, 16(T7) - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=4 -**********************************************************************************************/ - -.macro LOAD8x4_1 - LOAD8x4 1 -.endm - -.macro LOAD8x4_0 - LOAD8x4 0 -.endm - -.macro KERNEL8x4_L1_L4 Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, 
\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - -.endm - -.macro LOAD8x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - lxv vs25, 16(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 -.endif -.endm - -.macro END8x4_NORMAL - END8x4 0, AO, BO, 16,32 -.endm - -.macro END8x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.endif -.endm - -.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, 
DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - -.macro KERNEL8x4 First - LOAD8x4 0 - END8x4 \First, AO, BO, 16,32 -.endm - -.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - xvmulsp vs48, vs27, vs4 - xvmulsp vs49, vs27, vs5 - xvmulsp vs50, vs27, vs6 - xvmulsp vs51, vs27, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - - -.macro SAVE8x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - add T4, T2, T10 - add T5, T3, T10 -#if !defined(TRMMKERNEL) - lxv vs40, 0(T4) - lxv vs41, 0(T5) -#endif - add T6, T4, T10 - add T7, T5, T10 -#if !defined(TRMMKERNEL) - lxv vs42, 0(T6) - lxv vs43, 0(T7) -#endif - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - - xxmrglw vs0, vs51,vs48 - xxmrglw vs1, vs50,vs49 - xxmrglw vs4, vs48,vs51 - xxmrglw vs5, vs49,vs50 - - xxmrghw vs2, vs51,vs48 - xxmrghw vs3, vs50,vs49 - xxmrghw vs6, vs48,vs51 - xxmrghw vs7, vs49,vs50 - - xxmrgld vs28, vs1, vs0 - xxmrghd vs29,vs5,vs4 - - xxmrgld vs30, vs2, vs3 - xxmrghd vs31,vs6,vs7 -#if defined(TRMMKERNEL) - - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, alpha_r - xvmulsp vs39, vs27, alpha_r - xvmulsp vs40, vs28, alpha_r - xvmulsp vs41, vs29, alpha_r - xvmulsp vs42, vs30, alpha_r - xvmulsp vs43, vs31, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - xvmaddasp vs40, vs28, alpha_r - xvmaddasp vs41, vs29, alpha_r - 
xvmaddasp vs42, vs30, alpha_r - xvmaddasp vs43, vs31, alpha_r -#endif - - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - stxv vs40, 0(T4) - stxv vs41, 0(T5) - stxv vs42, 0(T6) - stxv vs43, 0(T7) - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=2 -**********************************************************************************************/ - - -.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero8x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - -.endm - -.macro KERNEL8x2 - KERNEL8x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP8(\Index,32) - -.endm - -.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs1, vs29, vs10 - xvmulsp vs2, vs28, vs11 - xvmulsp vs3, vs29, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs1, vs29, vs10 - xvmaddasp vs2, vs28, vs11 - xvmaddasp vs3, vs29, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE8x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - lxssp v8,0(T4) - lxssp v9,4(T4) - - lxssp v10,0(T5) - lxssp v11,4(T5) - - lxssp v12,0(T6) - lxssp v13,4(T6) - - lxssp v14,0(T7) - lxssp v15,4(T7) -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - xscvspdp vs9, vs3 - xxspltw vs10, vs3, 1 - xxspltw vs11, vs3, 2 - xxspltw vs12, vs3, 3 - xscvspdp vs10,vs10 - xscvspdp vs11,vs11 - xscvspdp vs12,vs12 - - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 - - - - -#if defined(TRMMKERNEL) - 
xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - xsmuldp vs40,vs12, vs4 - xsmuldp vs41,vs31, vs4 - - xsmuldp vs42,vs11, vs4 - xsmuldp vs43,vs30, vs4 - - xsmuldp vs44,vs10, vs4 - xsmuldp vs45,vs29, vs4 - - xsmuldp vs46,vs9, vs4 - xsmuldp vs47,vs28, vs4 -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - xsmaddadp vs40,vs12, vs4 - xsmaddadp vs41,vs31, vs4 - - xsmaddadp vs42,vs11, vs4 - xsmaddadp vs43,vs30, vs4 - - xsmaddadp vs44,vs10, vs4 - xsmaddadp vs45,vs29, vs4 - - xsmaddadp vs46,vs9, vs4 - xsmaddadp vs47,vs28, vs4 -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - stxssp v8,0(T4) - stxssp v9,4(T4) - - stxssp v10,0(T5) - stxssp v11,4(T5) - - stxssp v12,0(T6) - stxssp v13,4(T6) - - stxssp v14,0(T7) - stxssp v15,4(T7) - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=1 -**********************************************************************************************/ -.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero8x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 -.endm - -.macro KERNEL8x1 - KERNEL8x1_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_2 - KERNEL8x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL8x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) - lxv vs28, 32(\BREG) - lxv vs29, 48(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 64 -.endm - -.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) - lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) - lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) - lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) - lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs1, vs31, vs10 - xvmulsp vs0, vs32, vs11 - xvmulsp vs1, vs33, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs0, vs32, vs11 - xvmaddasp vs1, vs33, vs11 - .endif -.if \IsLast==1 - addi 
\AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP32(\Index,128) -.endif -.endm - -.macro SAVE8x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) - lxssp v8,0(T4) - lxssp v10,0(T5) - lxssp v12,0(T6) - lxssp v14,0(T7) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 - xsmuldp vs40,vs31, vs4 - xsmuldp vs42,vs30, vs4 - xsmuldp vs44,vs29, vs4 - xsmuldp vs46,vs28, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 - xsmaddadp vs40,vs31, vs4 - xsmaddadp vs42,vs30, vs4 - xsmaddadp vs44,vs29, vs4 - xsmaddadp vs46,vs28, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - stxssp v8,0(T4) - stxssp v10,0(T5) - stxssp v12,0(T6) - stxssp v14,0(T7) - addi CO,CO,4 -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=16 -**********************************************************************************************/ - -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm - -.macro KERNEL4x16_L1_L4 Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor 
vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - -.endif -.endm - -.macro END4x16_NORMAL - END4x16 0, AO, BO, 64,16 -.endm - -.macro END4x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - -.endif -.endm - -.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, 
vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endm - -.macro KERNEL4x16 First - - LOAD4x16 0 - END4x16 \First, AO, BO, 64,16 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endif - -.endm - - -.macro SAVE4x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - 
stxv vs36, 0(T1) - stxv vs37, 16(T1) - stxv vs38, 32(T1) - stxv vs39, 48(T1) - - stxv vs40, 0(T2) - stxv vs41, 16(T2) - stxv vs42, 32(T2) - stxv vs43, 48(T2) - stxv vs44, 0(T3) - stxv vs45, 16(T3) - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=8 -**********************************************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm - -.macro KERNEL4x8_L1_L4 Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END4x8_NORMAL - END4x8 0, AO, BO, 32,16 -.endm - -.macro Zero4X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endm - -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endif -.endm - - -.macro END4x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.endm - -.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, 
vs24, vs24,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - -.endm - -.macro KERNEL4x8 First - - LOAD4x8 0 - END4x8 \First, AO, BO, 32,16 -.endm - -.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - -.endif - -.endm - - -.macro SAVE4x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) 
- lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - stxv vs34, 0(CO) - stxv vs35, 16(CO) - stxv vs38, 0(T1) - stxv vs39, 16(T1) - stxv vs42, 0(T2) - stxv vs43, 16(T2) - stxv vs46, 0(T3) - stxv vs47, 16(T3) - - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=4 -**********************************************************************************************/ - -.macro LOAD4x4_1 - LOAD4x4 1 -.endm - -.macro LOAD4x4_0 - LOAD4x4 0 -.endm - -.macro KERNEL4x4_L1_L4 Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - -.macro LOAD4x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endif -.endm - -.macro END4x4_NORMAL - END4x4 0, AO, BO, 16,16 -.endm - -.macro END4x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - 
xvmaddasp vs35, vs24, vs3 - - -.endif -.endm - -.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - -.macro KERNEL4x4 First - LOAD4x4 0 - END4x4 \First, AO, BO, 16,16 -.endm - -.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) - -.endif -.endif - - -.endm - - -.macro SAVE4x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - #if defined(TRMMKERNEL) - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, 
alpha_r - xvmulsp vs39, vs27, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - #endif - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=2 -**********************************************************************************************/ - - -.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero4x2 - xxlxor vs0, vs0, vs0 - xxlxor vs2, vs2, vs2 - -.endm - -.macro KERNEL4x2 - KERNEL4x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP4(\Index,16) - -.endm - -.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs2, vs28, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs2, vs28, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE4x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=1 -**********************************************************************************************/ -.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero4x1 - xxlxor vs0, vs0, 
vs0 -.endm - -.macro KERNEL4x1 - KERNEL4x1_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_2 - KERNEL4x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 16 -.endm - -.macro KERNEL4x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs28, 16(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs0, vs32, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs0, vs32, vs11 - .endif -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif -.endm - -.macro SAVE4x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - addi CO,CO,4 -.endm - -/****************************N=2 section*****************/ - -.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - xxlxor vs6, vs6, vs6 - xxlxor vs7, vs7, vs7 -.endm - -.macro KERNEL2x16 - KERNEL2x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - xvmulsp vs6, vs28, vs9 - xvmulsp vs7, vs29, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, 
vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs2, vs32, vs12 - xvmaddasp vs3, vs33, vs12 - - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - xvmaddasp vs6, vs32, vs13 - xvmaddasp vs7, vs33, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs2, vs36, vs14 - xvmaddasp vs3, vs37, vs14 - - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - xvmaddasp vs6, vs36, vs15 - xvmaddasp vs7, vs37, vs15 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE2x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv 
vs19, 48(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - lxv vs28, 32(T1) - lxv vs29, 48(T1) -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r - xvmulsp vs28, vs6, alpha_r - xvmulsp vs29, vs7, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r - xvmaddasp vs28, vs6, alpha_r - xvmaddasp vs29, vs7, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - stxv vs28, 32(T1) - stxv vs29, 48(T1) - - addi CO,CO,64 - -.endm - -/* M=8 N=2 */ - -.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x8 - KERNEL2x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs17, 
DISP16(\Index,48+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE2x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - - addi CO,CO,32 - -.endm - - -/*M=4*/ - - -.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - /* we will aggregate on save vs0 +vs4 vs11+vs5 */ -.macro Zero2x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x4 - KERNEL2x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs30, vs13 - xvmaddasp vs4, vs34, vs14 - xvmaddasp vs5, vs34, vs15 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE2x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - -#endif - /*aggregate vectors*/ - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs26, vs1, alpha_r -#else - 
xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs26, vs1, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs26, 0(T1) - - addi CO,CO,16 - -.endm - - -/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ -.macro SWITCH_PERMUTE_INNER - xxpermdi permute_mask, permute_mask, permute_mask,2 -.endm - -.macro Zero2x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - SWITCH_PERMUTE_INNER -.endm - -.macro KERNEL2x2 - KERNEL2x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxperm vs9, vs36, permute_mask - lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs37, vs36 - xvmulsp vs1, vs37, vs9 - -.else - xvmaddasp vs0, vs37, vs36 - xvmaddasp vs1, vs37, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP2(\Index,8) - -.endm - - - - -.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - xxperm vs11, vs10, permute_mask - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs16, vs11 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - -.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP4(\Index,16) -.endif -.endm - - -.macro SAVE2x2 - -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) - -#endif - /*aggregate vectors*/ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - /* */ - /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ - xxperm vs1,vs1, permute_mask - - - xxmrghw vs2 ,vs1,vs0 - xxpermdi vs2,vs2,vs2,2 - xxmrghw vs3 ,vs0,vs1 -#if defined(TRMMKERNEL) - xvmulsp vs36, vs2, alpha_r - xvmulsp vs37, vs3, alpha_r -#else - xvmaddasp vs36, vs2, alpha_r - xvmaddasp vs37, vs3, alpha_r -#endif - /**** store last two words*/ - - - stxsd v4, 0(CO) - stxsd v5, 0(T1) - - addi CO,CO,8 - -.endm - -/*--------------------------- M=1 N=2 */ -.macro Zero2x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL2x1 - KERNEL2x1_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs2, vs37, vs35 - xvmulsp vs3, vs37, vs36 - -.else - xsmaddadp vs2, 
vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP2(\Index,8) -.endm - - -.macro SAVE2x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxssp v5 , 0(T1) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 2x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 0(T1) - - addi CO,CO,4 - -.endm - - - -/****************************N=1 section*****************/ - -.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x16 - KERNEL1x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, 
vs38, 2 - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs2, vs32, vs10 - xvmaddasp vs3, vs33, vs10 - - - xvmaddasp vs0, vs34, vs11 - xvmaddasp vs1, vs35, vs11 - xvmaddasp vs2, vs36, vs11 - xvmaddasp vs3, vs37, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE1x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv vs19, 48(CO) -#endif - - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - addi CO,CO,64 - -.endm - -/* M=8 N=1 */ - -.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x8 - KERNEL1x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, 
DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - - - xvmaddasp vs2, vs34, vs11 - xvmaddasp vs3, vs35, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE1x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - /* aggregate vs0 vs2 and vs1 vs3*/ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - addi CO,CO,32 - -.endm -/*M=4*/ - -.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x4 - KERNEL1x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - - xvmaddasp vs1, vs27, vs9 - - xvmaddasp vs2, vs30, vs10 - - - xvmaddasp vs3, vs31, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs9 - - -.if \IsLast==1 - addi \BREG, 
\BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE1x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - /* aggregate */ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 - xvaddsp vs0,vs1,vs0 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r -#endif - stxv vs16, 0(CO) - - addi CO,CO,16 - -.endm - -/* M=2 N=1*/ -.macro Zero1x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL1x2 - KERNEL1x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs2, vs37, vs35 - xvmuldp vs3, vs37, vs36 - -.else - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x2 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - lxssp v5 , 4(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 1x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 4(CO) - - addi CO,CO,8 - -.endm -/*///////////////// N=1 M=1 //////////////////*/ -.macro Zero1x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2,vs2 - xxlxor vs3,vs3,vs3 - xxlxor vs4,vs4,vs4 -.endm - -.macro KERNEL1x1 - KERNEL1x1_1 AO,BO, 1, 0,0,0 -.endm - -.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro 
KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone ( FIRST==1 to zero vs4) - */ -.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs4, vs37, vs35 - -.else - xsmaddadp vs4, vs37, vs35 - .endif - - addi \AREG, \AREG, DISP1(\Index,4) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - -.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) - lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) - lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) - lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) - lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - xvmaddasp vs2, vs10, vs17 - xvmaddasp vs3, vs11, vs18 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs8, vs26 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) - lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs36, vs37 - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors */ - xvaddsp vs0,vs0,vs1 - xvaddsp vs2,vs2,vs3 - xvaddsp vs0,vs0,vs2 - - xxpermdi vs7,vs0,vs0,2 - xvaddsp vs0,vs0,vs7 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs7,vs5,vs6 - xsadddp vs4,vs4,vs7 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs4, vs16 - -#else - xsmaddadp vs36,vs4, vs16 -#endif - - stxssp v4, 0(CO) - - addi CO,CO,4 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 3 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 2 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // 
ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + + + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 OffsetA,OffsetB + + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + 
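The DISP* defines at the top of the new kernel turn an unroll index into a byte offset: DISPn(ind, disp) expands to ind*n*unit_size + disp, where unit_size is the 4-byte width of one single-precision element, so each macro can address the ind-th unrolled chunk of A or B without recomputing pointers. A minimal C restatement of that arithmetic, assuming a standalone program rather than the assembler context:

/* Plain C model of the DISP* byte-offset macros defined above. */
#include <stdio.h>

#define unit_size 4
#define DISP16(ind, disp) ((ind) * unit_size * 16 + (disp))

int main(void)
{
    /* Unroll step 3 of a loop that consumes 16 floats of A per step:
     * the second 16-byte vector of that step starts at 3*64 + 16. */
    printf("%d\n", DISP16(3, 16)); /* prints 208 */
    return 0;
}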
xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0,0 + END8x16 \First, AO, BO, 64,32 +.endm + +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm + +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, 
DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore butterfly rank 1 updateto normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, 
vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 +#ifndef TRMMKERNEL + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + + + + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + stxv vs38, 32(T1) + stxv vs39, 48(T1) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + + stxv vs40, 0(T2) + stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T2) + stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T3) + stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ + #ifndef TRMMKERNEL + lxv vs32, 0(T4) + lxv vs33, 16(T4) +#endif + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) +#endif + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 
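Every SAVE macro in this file follows the same per-element rule: under TRMMKERNEL the accumulated product simply overwrites C (xvmulsp by alpha, with no prior load of the C tile), otherwise the existing C tile is loaded and the product is folded in with xvmaddasp. A scalar C sketch of that update, assuming illustrative code (made-up function name) rather than the vectorized, permuted store path used here:

/* Hypothetical per-element view of the SAVE step; acc is the
 * accumulated partial product of A*B for one entry of C. */
static void save_elem(float *c, float acc, float alpha)
{
#ifdef TRMMKERNEL
    *c = acc * alpha;   /* TRMM: C is write-only */
#else
    *c += acc * alpha;  /* GEMM: C := alpha*A*B + C */
#endif
}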
+#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + + + stxv vs38, 32(T5) + stxv vs39, 48(T5) + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + stxv vs40, 0(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T6) + stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + + stxv vs44, 0(T7) + stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro 
KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + lxv vs24, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + +.if \Complete==0 + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, 
vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + 
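The KERNEL8x16_2 and KERNEL8x8_L1_L4_I bodies above interleave the loads of the next unroll step's A/B panels between the FMAs of the current step, and the Complete flag suppresses those loads on the final step so AO/BO can be advanced past the whole span with a single addi. A scalar C sketch of that software-pipelining pattern, assuming single floats in place of 4-wide VSX vectors and a made-up function name:

/* Hypothetical scalar illustration of the load-ahead loop structure;
 * the preload before the loop corresponds to LOAD8x16/LOAD8x8 and the
 * guarded refill corresponds to the .if \Complete==0 lxv blocks. */
static float pipelined_dot(const float *a, const float *b, int steps)
{
    float acc = 0.0f;
    float ai = a[0], bi = b[0];      /* preloaded before the loop */
    for (int i = 0; i < steps; i++) {
        acc += ai * bi;              /* FMA on the current operands */
        if (i + 1 < steps) {         /* Complete==0: fetch the next step */
            ai = a[i + 1];
            bi = b[i + 1];
        }
    }
    return acc;
}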
xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + 
lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + +.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + 
xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 
OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + 
xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp 
v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + 
xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi 
vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, 
vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 
+**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, 
vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, 
vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + 
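+/* Reference note (illustrative sketch, not part of this patch): every macro family above
+   (Zero*/LOAD*/KERNEL*/SAVE* for each N x M tile size) follows one pattern -- clear a group of
+   VSX accumulators, stream packed panels of A and B through xvmaddasp fused multiply-adds, then
+   merge/permute the accumulators and fold in alpha against the C tile in the SAVE step. The
+   scalar C sketch below shows only the arithmetic one MR x NR tile performs; ref_kernel, mr, nr,
+   ldc and the packed-panel layout are assumed names for this sketch, while the real kernels
+   vectorize the same update with lxv/xvmaddasp and the permute masks used above.
+
+   static void ref_kernel(int mr, int nr, int k,
+                          const float *A,    // packed MR x K panel
+                          const float *B,    // packed K x NR panel
+                          float *C, int ldc, float alpha, int trmm)
+   {
+       for (int j = 0; j < nr; j++) {
+           for (int i = 0; i < mr; i++) {
+               float acc = 0.0f;                    // one accumulator register
+               for (int p = 0; p < k; p++)
+                   acc += A[p * mr + i] * B[p * nr + j];
+               if (trmm)
+                   C[j * ldc + i] = alpha * acc;    // TRMMKERNEL path: overwrite C
+               else
+                   C[j * ldc + i] += alpha * acc;   // GEMM path: C += alpha*A*B
+           }
+       }
+   }
+
+   TRMMKERNEL builds overwrite the tile instead of accumulating into it, which is why every SAVE
+   macro carries the xvmulsp / xvmaddasp (or xsmuldp / xsmaddadp) pair. */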
+/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, 
\AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 
0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp 
vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, 
DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* on save we aggregate vs0+vs4 and vs1+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2: switch to an inner permute. The permute mask above reversed the word order 3,2,1,0; after this switch it reverses within each half instead, giving 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi 
permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + 
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 
64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 
0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r 
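+ /* Editorial note (hedged): the two branches above differ only in whether
+    the C values loaded into vs16 are accumulated. Roughly, per element j
+    of the four floats held in vs16:
+        c[j] = alpha * acc[j];         // TRMMKERNEL: C is overwritten
+        c[j] = c[j] + alpha * acc[j];  // GEMM path: C was loaded at the top
+    before the stxv that follows. */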
+#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + 
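+ /* Editorial note (hedged sketch): each call contributes one term of a
+    scalar dot product, roughly
+        acc = first ? a[k] * b[k] : acc + a[k] * b[k];
+    with the running value kept in vs4. Using a multiply when \First==1
+    initializes vs4 without a separate zeroing step; SAVE1x1 later combines
+    vs4 with the vector accumulators vs0..vs3. */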
+ +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 5dfb18f5b..f5c1ba729 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -1,470 +1,470 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_n.c" - -#else - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float v_x4 = {x4,x4,x4,x4}; - __vector float v_x5 = {x5,x5,x5,x5}; - __vector float v_x6 = {x6,x6,x6,x6}; - __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i++) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i++ ) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - } - -} - - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = 
NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); - } - - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); - } - - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n 
& -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); - } - - - return(0); -} - -#endif - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" + +#else + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float v_x4 = {x4,x4,x4,x4}; + __vector float v_x5 = {x5,x5,x5,x5}; + __vector float v_x6 = {x6,x6,x6,x6}; + __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i++) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i++ ) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + } + +} + + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = 
NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n 
& -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + +#endif + diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c index 64696236a..0edb79129 100644 --- a/kernel/power/sgemv_n_8.c +++ b/kernel/power/sgemv_n_8.c @@ -1,514 +1,514 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. 
But it was not competitive enough to be added in production -It could be used and tested in future or could provide barebone for switching to inline assembly -*/ - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - register __vector float v_x0 = {x0,x0,x0,x0}; - register __vector float v_x1 = {x1,x1,x1,x1}; - register __vector float v_x2 = {x2,x2,x2,x2}; - register __vector float v_x3 = {x3,x3,x3,x3}; - register __vector float v_x4 = {x4,x4,x4,x4}; - register __vector float v_x5 = {x5,x5,x5,x5}; - register __vector float v_x6 = {x6,x6,x6,x6}; - register __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i+=2) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float vb0_1=vb0[i] ; - register __vector float vb0_2=vb0[i+1] ; - register __vector float vb1_1=vb1[i] ; - register __vector float vb1_2=vb1[i+1] ; - register __vector float vb2_1=vb2[i] ; - register __vector float vb2_2=vb2[i+1] ; - register __vector float vb3_1=vb3[i] ; - register __vector float vb3_2=vb3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i+=2 ) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; 
- register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } - -} - - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 7 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - - if ( m3 & 4 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - if ( lda == 4 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; - temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; - - temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; - temp1 += a_ptr[9] * 
x_ptr[2] + a_ptr[13] * x_ptr[3]; - temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; - temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; - - a_ptr += 16; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0] ; - a_ptr +=4; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - y_ptr += inc_y; - y_ptr[0] += alpha * temp3; - y_ptr += inc_y; - a += 4; - } - - - if ( m3 & 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - a += 2; - } - - if ( m3 & 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - - - } - - - return(0); -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could provide barebone for switching to inline assembly +*/ + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + register __vector float v_x0 = {x0,x0,x0,x0}; + register __vector float v_x1 = {x1,x1,x1,x1}; + register __vector float v_x2 = {x2,x2,x2,x2}; + register __vector float v_x3 = {x3,x3,x3,x3}; + register __vector float v_x4 = {x4,x4,x4,x4}; + register __vector float v_x5 = {x5,x5,x5,x5}; + register __vector float v_x6 = {x6,x6,x6,x6}; + register __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i+=2) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float vb0_1=vb0[i] ; + register __vector float vb0_2=vb0[i+1] ; + register __vector float vb1_1=vb1[i] ; + register __vector float vb1_2=vb1[i+1] ; + register __vector float vb2_1=vb2[i] ; + register __vector float vb2_2=vb2[i+1] ; + register __vector float vb3_1=vb3[i] ; + register __vector float vb3_2=vb3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 
= {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i+=2 ) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; + } + +} + + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] ; + v_y[i+1] += v_x0 * va0[i+1] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 7 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += 
lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + + if ( m3 & 4 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + if ( lda == 4 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; + temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; + + temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; + temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3]; + temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; + temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; + + a_ptr += 16; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0] ; + a_ptr +=4; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + y_ptr += inc_y; + y_ptr[0] += alpha * temp3; + y_ptr += inc_y; + a += 4; + } + + + if ( m3 & 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + a += 2; + } + + if ( m3 & 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + + + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 62c517a9d..c3fc8e77a 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -1,484 +1,484 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_t.c" - -#else - -#include "common.h" - -#define NBMAX 2048 - -#include - -static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - temp4 += v_x[i] * va4[i]; - temp5 += v_x[i] * va5[i]; - temp6 += v_x[i] * va6[i]; - temp7 += v_x[i] * va7[i]; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - 
__vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i] ; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - 
a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 == 3) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 3 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - aj += 3; - } - - } else { - - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr += inc_y; - aj += lda; - } - - } - - } - return (0); - } - - if (m3 == 2) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - return (0); - - } - - FLOAT xtemp = *x_ptr * alpha; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - 
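The scalar m3 tail being handled here adds the contribution of each leftover row of A to every entry of y; in this transposed kernel y accumulates one partial dot product per column. A minimal scalar sketch of that single-row update under the same column-major convention (sgemv_t_tail_row is a hypothetical helper name, not part of this file; FLOAT and BLASLONG are the types from common.h):

   /* y[j] += alpha * x[row] * A[row + j*lda] for every column j.
      x_scaled is expected to already hold x[row] * alpha, and a_row
      points at element `row` of column 0 of A. */
   static void sgemv_t_tail_row(BLASLONG n, BLASLONG lda,
                                const FLOAT *a_row, FLOAT x_scaled, FLOAT *y)
   {
       BLASLONG j;
       for (j = 0; j < n; j++)
           y[j] += a_row[j * lda] * x_scaled;
   }

With lda == 1 the access collapses to the contiguous fast path of the unrolled loop just above; the strided form corresponds to the generic branches further below.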
for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - } - - return (0); - -} - -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else + +#include "common.h" + +#define NBMAX 2048 + +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + 
temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i] ; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * 
xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + +#endif diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index b90512162..1ee7c8aeb 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -1,508 +1,508 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. But it was not competitive enough to be added in production -It could be used and tested in future or could be used as base for switching to inline assembly -*/ - -#include "common.h" -#include -#define NBMAX 4096 - -#include - -static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i +=2) { - register __vector float vx1=v_x[i] ; - register __vector float vx2=v_x[i+1] ; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float va4_1=va4[i] ; - register __vector float va4_2=va4[i+1] ; - register __vector float va5_1=va5[i] ; - register __vector float va5_2=va5[i+1] ; - register __vector float va6_1=va6[i] ; - register __vector float va6_2=va6[i+1] ; - register __vector float va7_1=va7[i] ; - register __vector float va7_2=va7[i+1] ; - temp0 += vx1* va0_1 + vx2 * va0_2; - temp1 += vx1* va1_1 + vx2 * va1_2; - temp2 += vx1* va2_1 + vx2 * va2_2; - 
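/* Descriptive note: temp0..temp7 are per-column 4-lane accumulators. Each pass
   of this unrolled loop consumes two consecutive 4-float vectors of x (eight
   elements) together with the matching elements of the eight columns a0..a7 and
   folds the products in; after the loop the four lanes of each accumulator are
   summed horizontally and scaled by alpha into y[0..7]. */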
temp3 += vx1* va3_1 + vx2 * va3_2; - temp4 += vx1* va4_1 + vx2 * va4_2; - temp5 += vx1* va5_1 + vx2 * va5_2; - temp6 += vx1* va6_1 + vx2 * va6_2; - temp7 += vx1* va7_1 + vx2 * va7_2; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 7; - m1 = m - 
m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 & 4) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp3 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 4 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; - y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; - y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; - aj += 16; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - aj += 4; - } - - } else if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; - y_ptr += inc_y; - aj += lda; - } - - } - if 
(m3==4) return (0); - a_ptr += 4; - } - - if (m3 & 2 ) { - - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - if (m3==2) return (0); - a_ptr += 2; - } - if (m3 & 1) { - - FLOAT xtemp = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - - } - a_ptr += 1; - } - return (0); - -} - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could be used as base for switching to inline assembly +*/ + +#include "common.h" +#include +#define NBMAX 4096 + +#include + +static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i +=2) { + register __vector float vx1=v_x[i] ; + register __vector float vx2=v_x[i+1] ; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float va4_1=va4[i] ; + register __vector float va4_2=va4[i+1] ; + register __vector float va5_1=va5[i] ; + register __vector float va5_2=va5[i+1] ; + register __vector float va6_1=va6[i] ; + register __vector float va6_2=va6[i+1] ; + register __vector float va7_1=va7[i] ; + register __vector float va7_2=va7[i+1] ; + temp0 += vx1* va0_1 + vx2 * va0_2; + temp1 += vx1* va1_1 + vx2 * va1_2; + temp2 += vx1* va2_1 + vx2 * va2_2; + temp3 += vx1* va3_1 + vx2 * va3_2; + temp4 += vx1* va4_1 + vx2 * va4_2; + temp5 += vx1* va5_1 + vx2 * va5_2; + temp6 += vx1* va6_1 + vx2 * va6_2; + temp7 += vx1* va7_1 + vx2 * va7_2; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT 
*y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + } + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 7; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += 
ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 & 4) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp3 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 4 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; + y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; + y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; + aj += 16; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + aj += 4; + } + + } else if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; + y_ptr += inc_y; + aj += lda; + } + + } + if (m3==4) return (0); + a_ptr += 4; + } + + if (m3 & 2 ) { + + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + 
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + if (m3==2) return (0); + a_ptr += 2; + } + if (m3 & 1) { + + FLOAT xtemp = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + + } + a_ptr += 1; + } + return (0); + +} + diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index d1e60da6c..f9320d516 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -1,245 +1,245 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - -#define LOAD ld - -#define STACKSIZE 512 - -#define FZERO 312+192(SP) - -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ - -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - - -#define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 - -#define VECSAVE r11 - -#define FRAMEPOINTER r12 - -#define T10 r14 - -#define L r15 -#define T8 r16 -#define T5 r17 -#define T2 r19 -#define TEMP_REG r20 -#define T6 r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T7 r27 -#define T3 r28 -#define T4 r29 - -#define PRE r30 -#define T1 r31 - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - mflr r0 - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - xxspltd alpha_r,vs1,0 /*copy from register f1 */ - xxspltd alpha_i,vs2,0 /*copy from register f2 */ - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - std r0, FLINK_SAVE(SP) - - -#if defined(linux) || defined(__FreeBSD__) - ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) -#endif - - -#ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) -#endif -#endif - - -#include "zgemm_macros_power9.S" - - - - slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li r0, 0 - - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegdp alpha_r,alpha_r - xvnegdp alpha_i,alpha_i -#endif - .align 4 - -#include "zgemm_logic_power9.S" - -L999: - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - 
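The epilogue above mirrors the prologue: it reloads the callee-saved FPRs f14..f31, the GPRs r14..r31 and the VSX registers vs52..vs63 from the same frame offsets at which they were spilled, recovers the saved return address from FLINK_SAVE into the link register with mtlr, releases the STACKSIZE (512 byte) frame and returns with blr.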
EPILOGUE +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) 
+ + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE #endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index fe5d8ade2..850b41aff 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -1,1891 +1,1891 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#define MY_ALIGN .align 3 -b ZGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -ZGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 -ZGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_L2 256,64,31,0 - KERNEL2x8_L2 256,64,32,0 - KERNEL2x8_L2 256,64,33,0 - KERNEL2x8_L2 256,64,34,0 - KERNEL2x8_L2 256,64,35,0 - KERNEL2x8_L2 256,64,36,0 - KERNEL2x8_L2 256,64,37,0 - KERNEL2x8_L2 256,64,38,0 - KERNEL2x8_L2 256,64,39,0 - KERNEL2x8_L2 256,64,40,0 - KERNEL2x8_L2 256,64,41,0 - KERNEL2x8_L2 256,64,42,0 - KERNEL2x8_L2 256,64,43,0 - KERNEL2x8_L2 256,64,44,0 - KERNEL2x8_L2 256,64,45,0 - KERNEL2x8_L2 256,64,46,0 - KERNEL2x8_L2 256,64,47,0 - KERNEL2x8_L2 256,64,48,0 - KERNEL2x8_L2 256,64,49,0 - KERNEL2x8_L2 256,64,50,0 - KERNEL2x8_L2 256,64,51,0 - KERNEL2x8_L2 256,64,52,0 - KERNEL2x8_L2 256,64,53,0 - KERNEL2x8_L2 256,64,54,0 - KERNEL2x8_L2 256,64,55,0 - KERNEL2x8_L2 256,64,56,0 - KERNEL2x8_L2 256,64,57,0 - KERNEL2x8_L2 256,64,58,0 - KERNEL2x8_L2 256,64,59,0 - KERNEL2x8_L2 256,64,60,0 - KERNEL2x8_L2 256,64,61,0 - KERNEL2x8_L2 256,64,62,0 - KERNEL2x8_L2 256,64,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -ZGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_E2 256,64,31,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt 
AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_E2 256,64,15,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_E2 256,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -ZGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,0,0 -ZGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_L2 128,64,7,0 - KERNEL2x4_L2 128,64,8,0 - KERNEL2x4_L2 128,64,9,0 - KERNEL2x4_L2 128,64,10,0 - KERNEL2x4_L2 128,64,11,0 - KERNEL2x4_L2 128,64,12,0 - KERNEL2x4_L2 128,64,13,0 - KERNEL2x4_L2 128,64,14,0 - KERNEL2x4_L2 128,64,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -ZGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_E2 128,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_E2 128,64,3,1 - blr - - -ZGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -ZGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,0,0 -ZGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_L2 64,64,7,0 - KERNEL2x2_L2 64,64,8,0 - KERNEL2x2_L2 64,64,9,0 - KERNEL2x2_L2 64,64,10,0 - KERNEL2x2_L2 64,64,11,0 - KERNEL2x2_L2 64,64,12,0 - KERNEL2x2_L2 64,64,13,0 - KERNEL2x2_L2 64,64,14,0 - KERNEL2x2_L2 64,64,15,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN - - -ZGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -ZGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_E2 64,64,7,1 - blr - MY_ALIGN -ZGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_E2 64,64,3,1 - blr - - -ZGEMM_2x1_LMAIN_SUB: 
-/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -ZGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,0,0 -ZGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_L2 32,64,7,0 - KERNEL2x1_L2 32,64,8,0 - KERNEL2x1_L2 32,64,9,0 - KERNEL2x1_L2 32,64,10,0 - KERNEL2x1_L2 32,64,11,0 - KERNEL2x1_L2 32,64,12,0 - KERNEL2x1_L2 32,64,13,0 - KERNEL2x1_L2 32,64,14,0 - KERNEL2x1_L2 32,64,15,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -ZGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_E2 32,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_E2 32,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -ZGEMM_L2: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 1 - ble ZGEMM_L2_END - - -ZGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - - -ZGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8O 128,32 - END2x8_WITHOUT_ADD - LOAD2x8_2O 256, 64 - mtctr T8 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-256 - LOAD2x8_2O 256,64 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - MY_ALIGN - - -ZGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_L2 256,64, 1,0 - KERNEL2x8_L2 256,64, 2,0 - KERNEL2x8_E2 256,64, 3,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_E2 256,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 256,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 - - -ZGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN - - -ZGEMM_L2x8_END: -/*----------------------------------------*/ - - -ZGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble ZGEMM_L2x4_SUB0 - bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 - - -ZGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4O 64,32 - END2x4_WITHOUT_ADD - LOAD2x4_2O 128, 64 - mtctr T8 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD2x4_2O 128,64 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x4_SUB2_4 - bl ZGEMM_2x4_L8_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 128,64, 0,0 - KERNEL2x4_E2 128,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 128,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 - - -ZGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -ZGEMM_L2x4_END: -/*----------------------------------------*/ - - -ZGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble ZGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble ZGEMM_L2x2_SUB0 - bl ZGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x2_SAVE - b ZGEMM_L2x2_SUB2 - - -ZGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2O 32,32 - END2x2_WITHOUT_ADD - LOAD2x2_2O 64, 64 - mtctr T8 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD2x2_2O 64,64 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x2_SUB2_8 - bl ZGEMM_2x2_L16_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x2_SUB2_4 - bl ZGEMM_2x2_L8_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 64,64, 0,0 - KERNEL2x2_E2 64,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 64,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 - - -ZGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -ZGEMM_L2x2_END: -/*----------------------------------------*/ - - -ZGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble ZGEMM_L2x1_SUB0 - bl ZGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x1_SAVE - b ZGEMM_L2x1_SUB2 - - -ZGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1O 16,32 - END2x1_WITHOUT_ADD - LOAD2x1_2O 32, 64 - mtctr T8 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD2x1_2O 32,64 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x1_SUB2_8 - bl ZGEMM_2x1_L16_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L2x1_SUB2_4 - bl ZGEMM_2x1_L8_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 32,64, 0,0 - KERNEL2x1_E2 32,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 32,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 - - -ZGEMM_L2x1_SAVE: -/*----------------------------------------*/ - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -ZGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - bgt ZGEMM_L2_BEGIN - - -ZGEMM_L2_END: - -b ZGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -ZGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 -ZGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_L2 256,32,31,0 - KERNEL1x8_L2 256,32,32,0 - KERNEL1x8_L2 256,32,33,0 - KERNEL1x8_L2 256,32,34,0 - KERNEL1x8_L2 256,32,35,0 - KERNEL1x8_L2 256,32,36,0 - KERNEL1x8_L2 256,32,37,0 - KERNEL1x8_L2 256,32,38,0 - KERNEL1x8_L2 256,32,39,0 - KERNEL1x8_L2 256,32,40,0 - KERNEL1x8_L2 256,32,41,0 - KERNEL1x8_L2 256,32,42,0 - KERNEL1x8_L2 256,32,43,0 - KERNEL1x8_L2 256,32,44,0 - KERNEL1x8_L2 256,32,45,0 - KERNEL1x8_L2 256,32,46,0 - KERNEL1x8_L2 256,32,47,0 - KERNEL1x8_L2 256,32,48,0 - KERNEL1x8_L2 256,32,49,0 - KERNEL1x8_L2 256,32,50,0 - KERNEL1x8_L2 256,32,51,0 - KERNEL1x8_L2 256,32,52,0 - KERNEL1x8_L2 256,32,53,0 - KERNEL1x8_L2 256,32,54,0 - KERNEL1x8_L2 256,32,55,0 - KERNEL1x8_L2 256,32,56,0 - KERNEL1x8_L2 256,32,57,0 - KERNEL1x8_L2 256,32,58,0 - KERNEL1x8_L2 256,32,59,0 - KERNEL1x8_L2 256,32,60,0 - KERNEL1x8_L2 256,32,61,0 - KERNEL1x8_L2 256,32,62,0 - KERNEL1x8_L2 256,32,63,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -ZGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - 
KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_E2 256,32,31,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_E2 256,32,15,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_E2 256,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN - - -ZGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,0,0 - - -ZGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_L2 128,32,7,0 - KERNEL1x4_L2 128,32,8,0 - KERNEL1x4_L2 128,32,9,0 - KERNEL1x4_L2 128,32,10,0 - KERNEL1x4_L2 128,32,11,0 - KERNEL1x4_L2 128,32,12,0 - KERNEL1x4_L2 128,32,13,0 - KERNEL1x4_L2 128,32,14,0 - KERNEL1x4_L2 128,32,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN - - -ZGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -ZGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_E2 128,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_E2 128,32,3,1 - blr - - -ZGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN - - -ZGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,0,0 - - -ZGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_L2 64,32,7,0 - KERNEL1x2_L2 64,32,8,0 - KERNEL1x2_L2 
64,32,9,0 - KERNEL1x2_L2 64,32,10,0 - KERNEL1x2_L2 64,32,11,0 - KERNEL1x2_L2 64,32,12,0 - KERNEL1x2_L2 64,32,13,0 - KERNEL1x2_L2 64,32,14,0 - KERNEL1x2_L2 64,32,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN - - -ZGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN - - -ZGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_E2 64,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_E2 64,32,3,1 - blr - - -ZGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN - - -ZGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,0,0 - - -ZGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_L2 32,32,7,0 - KERNEL1x1_L2 32,32,8,0 - KERNEL1x1_L2 32,32,9,0 - KERNEL1x1_L2 32,32,10,0 - KERNEL1x1_L2 32,32,11,0 - KERNEL1x1_L2 32,32,12,0 - KERNEL1x1_L2 32,32,13,0 - KERNEL1x1_L2 32,32,14,0 - KERNEL1x1_L2 32,32,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN - - -ZGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - MY_ALIGN - - -ZGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_E2 32,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_E2 32,32,3,1 - blr - - -/*----------------------N1 BEGINS---------*/ -ZGEMM_L1: -/*----------------------------------------*/ - andi. T1, N, 1 - ble ZGEMM_L1_END - -ZGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - bl ZGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L1x8_SAVE - b ZGEMM_L1x8_SUB2 - - -ZGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. 
L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8O 128,16 - END1x8_WITHOUT_ADD - LOAD1x8_2O 256, 32 - mtctr T8 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-256 - LOAD1x8_2O 256,32 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - MY_ALIGN - - -ZGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L1x8_SUB2_32 - bl ZGEMM_1x8_L64_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L1x8_SUB2_16 - bl ZGEMM_1x8_L32_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x8_SUB2_8 - bl ZGEMM_1x8_L16_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_L2 256,32, 1,0 - KERNEL1x8_L2 256,32, 2,0 - KERNEL1x8_E2 256,32, 3,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_E2 256,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 256,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - - -ZGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt ZGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - b ZGEMM_L1x4_BEGIN - MY_ALIGN - - -ZGEMM_L1x8_END: -/*----------------------------------------*/ - - -ZGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - bl ZGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE - b ZGEMM_L1x4_SUB2 - - -ZGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4O 64,16 - END1x4_WITHOUT_ADD - LOAD1x4_2O 128, 32 - mtctr T8 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD1x4_2O 128,32 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x4_SUB2_8 - bl ZGEMM_1x4_L16_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L1x4_SUB2_4 - bl ZGEMM_1x4_L8_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 128,32, 0,0 - KERNEL1x4_E2 128,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 128,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 - - -ZGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -ZGEMM_L1x4_END: -/*----------------------------------------*/ - - -ZGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble ZGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - bl ZGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE - b ZGEMM_L1x2_SUB2 - - -ZGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2O 32,16 - END1x2_WITHOUT_ADD - LOAD1x2_2O 64, 32 - mtctr T8 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD1x2_2O 64,32 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x2_SUB2_8 - bl ZGEMM_1x2_L16_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x2_SUB2_4 - bl ZGEMM_1x2_L8_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 64,32, 0,0 - KERNEL1x2_E2 64,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 64,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 - - -ZGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -ZGEMM_L1x2_END: -/*----------------------------------------*/ - - -ZGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - bl ZGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE - b ZGEMM_L1x1_SUB2 - - -ZGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. 
L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1O 16,16 - END1x1_WITHOUT_ADD - LOAD1x1_2O 32, 32 - mtctr T8 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD1x1_2O 32,32 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x1_SUB2_8 - bl ZGEMM_1x1_L16_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x1_SUB2_4 - bl ZGEMM_1x1_L8_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 32,32, 0,0 - KERNEL1x1_E2 32,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 32,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 - - -ZGEMM_L1x1_SAVE: -/*----------------------------------------*/ - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -ZGEMM_L1x1_END: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - - -ZGEMM_L1_END: -/*----------------------------------------*/ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +ZGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt 
AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +ZGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: 
+/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + bl ZGEMM_2x2_L8_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + bl ZGEMM_2x1_L8_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN + + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + 
KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 
64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + bl ZGEMM_1x4_L8_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + bl ZGEMM_1x2_L8_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1O 16,16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD1x1_2O 32,32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x1_SUB2_8 + bl ZGEMM_1x1_L16_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x1_SUB2_4 + bl ZGEMM_1x1_L8_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32,32, 0,0 + KERNEL1x1_E2 32,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 8670e9574..68024b826 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -1,1825 +1,1825 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define unit_size 16 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) -/* HELPERS FOR SAVE */ -/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ - - -.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET -#ifndef TRMMKERNEL - lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) - lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) - xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 - xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 -#endif -.endm -/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ - - -.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ -.endm -/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ - - -.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ -.endm -/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ - - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead instead to fix sign*/ - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm -/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ - - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 -#ifndef TRMMKERNEL - xvmsubadp \VSOUT1,\VSINII, alpha_i - xvmaddadp \VSOUT2,\VSINRR, alpha_i -#else - xvmuldp \VSOUT1,\VSINII, alpha_i - xvmuldp \VSOUT2,\VSINRR, alpha_i -#endif -.endm -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubadp \VSOUT1,\VSINRR, alpha_r - xvmaddadp \VSOUT2,\VSINII, alpha_r -.endm -/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ - - -.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrghd \VSOUT1,\VSIN2,\VSIN1 - xxmrgld \VSOUT2,\VSIN2,\VSIN1 -.endm - - -.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 - stxv \VSIN1, DISPX(\LOFFSET)(\REG) - stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) -.endm - - -.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - 
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 - MULT_APLHA_PART1 vs6,vs8,vs16,vs17 - MULT_APLHA_PART2 vs2,vs4,vs14,vs15 - AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - MULT_APLHA_PART1 vs10,vs12, vs24,vs25 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - MULT_APLHA_PART2 vs10,vs12,vs24,vs25 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 - MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 - UNPACK_FOR_STORE vs24,vs25,vs10,vs12 - UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 - STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 - STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 -.endm - - -.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART1 vs6,vs8, vs16,vs17 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 -.endm - - - -.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 -.endm - - - -.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 -#ifndef TRMMKERNEL - lxv vs18, (\LOFFSET)(\BASE_REG) - xxmrgld vs14,vs18,vs18 - xxmrghd vs15,vs18,vs18 -#endif - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - xxmrghd vs7,vs15,vs14 - stxv vs7, (\LOFFSET)(\BASE_REG) -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 
- xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,128,32 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x8_2 - /*for load2 offset will be 256 and 64*/ - KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 -.endm - - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs48, vs8, vs22 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs49, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs50, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs51, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs52, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs54, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs55, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs56, vs12, vs22 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs58, vs13, vs22 - xvmaddadp vs43, vs13, vs21 - xvmaddadp vs59, vs13, 
vs23 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs60, vs14, vs22 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs62, vs15, vs22 - xvmaddadp vs47, vs15, vs21 - xvmaddadp vs63, vs15, vs23 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 128,32 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,64,32 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 - -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_2 - /*for load2 offset will be 128 and 64*/ - KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs41, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs43, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs47, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 64,32 -.endm - - - -.macro SAVE2x4 - add T1, CO ,LDC - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 - addi CO, CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero2x2 - xxlxor 
vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,32,32 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 - -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_2 - /*for load2 offset will be 64 and 64*/ - KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 -.endm - - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs37, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs39, vs9, vs23 -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x2 - LOAD2x2 - 
END2x2 AO, BO, 32,32 -.endm - - - -.macro SAVE2x2 - add T1, CO ,LDC - SAVE2 vs32,vs33,vs34,vs35,CO,0 - SAVE2 vs36,vs37,vs38,vs39,T1,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,16,32 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_2 - /*for load2 offset will be 32 and 64*/ - KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 -.endm - - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs35, vs8, vs23 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 16,32 -.endm - - - -.macro SAVE2x1 - add T1, CO ,LDC - SAVE1 vs32,vs33,CO,0 - SAVE1 vs34,vs35,T1,0 - addi CO, CO, 16 -.endm - -/********************************************************************************************** -* - -.macros for N=1 and M=8 
-**********************************************************************************************/ - - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,128,16 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 - -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x8_2 - /*for load2 offset will be 256 and 32*/ - KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 -.endm - - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, 
vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs43, vs13, vs21 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs47, vs15, vs21 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 128,16 -.endm - - -.macro SAVE1x8 - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,64,16 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x4_2 - /*for load2 offset will be 128 and 32*/ - KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 -.endm - - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 64,16 -.endm - - - -.macro SAVE1x4 - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - addi CO, 
CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,32,16 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x2_2 - /*for load2 offset will be 64 and 32*/ - KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 -.endm - - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 32,16 -.endm - - - -.macro SAVE1x2 - SAVE2 vs32,vs33,vs34,vs35,CO,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A - xxswapd vs17, vs16 - -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,16,16 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro LOAD1x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x1_2 - /*for load2 offset will be 32 and 32*/ - KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 -.endm - - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 16,16 -.endm - - - -.macro SAVE1x1 - SAVE1 vs32,vs33,CO,0 - addi CO, CO, 16 -.endm - -/****************************TRMM POINTER REFRESH - -.macroSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 8 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 4 - .endif -.endm -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ - - -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ - - -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
- /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - #endif - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) +/* HELPERS FOR SAVE */ +/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm +/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm +/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm +/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm + + +.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + 
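The DISPn defines at the top of this new macro file turn an unroll index into a byte displacement: unit_size is 16 bytes (one double-precision complex value), so DISP16 steps through 16 complex elements (256 bytes) per index, DISP8 through 8, and so on down to DISP1. A minimal C check of that arithmetic, written against the defines above and the pointer bumps the kernels use later; it is an illustration only, not part of the patch.

#include <assert.h>

#define unit_size 16                       /* sizeof(double complex) in bytes     */
#define DISP16(ind,disp) ((ind)*unit_size*16+(disp))
#define DISP8(ind,disp)  ((ind)*unit_size*8+(disp))
#define DISP4(ind,disp)  ((ind)*unit_size*4+(disp))
#define DISP2(ind,disp)  ((ind)*unit_size*2+(disp))

int main(void)
{
    /* KERNEL2x8_2 advances A by 256 bytes (8 complex values x 2 k-steps)
       and B by 64 bytes (2 complex values x 2 k-steps) per unrolled pass.  */
    assert(DISP16(0, 256) == 256 && DISP4(0, 64) == 64);
    /* The same displacements expressed with the next index and offset 0.   */
    assert(DISP16(1, 0) == 256 && DISP4(1, 0) == 64);
    /* The 1xN kernels read one B value per k-step, hence DISP2 = 32 bytes. */
    assert(DISP2(1, 0) == 32);
    return 0;
}
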
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs6,vs8,vs16,vs17 + MULT_APLHA_PART2 vs2,vs4,vs14,vs15 + AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + MULT_APLHA_PART1 vs10,vs12, vs24,vs25 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + MULT_APLHA_PART2 vs10,vs12,vs24,vs25 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 + UNPACK_FOR_STORE vs24,vs25,vs10,vs12 + UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART1 vs6,vs8, vs16,vs17 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 +.endm + + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 +.endm + + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 +#ifndef TRMMKERNEL + lxv vs18, (\LOFFSET)(\BASE_REG) + xxmrgld vs14,vs18,vs18 + xxmrghd vs15,vs18,vs18 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + xxmrghd vs7,vs15,vs14 + stxv vs7, (\LOFFSET)(\BASE_REG) +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 
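The SAVE helpers above reduce each pair of accumulators into {real*real, imag*imag} and {real*imag, imag*real} lanes, combine them according to the conjugation case, then scale by the complex alpha and add the existing C values. The following scalar C model of the default (NN/NT/TN/TT) path is an illustration only, not part of the patch; it mirrors AGGREGATE_REALS_IMAGES followed by MULT_APLHA_PART1/PART2 for one output element.

#include <stdio.h>

/* Accumulators as produced by the kernel for one C element:
   rr = sum(a_r*b_r), ii = sum(a_i*b_i), ri = sum(a_r*b_i), ir = sum(a_i*b_r). */
static void save_one(double rr, double ii, double ri, double ir,
                     double alpha_r, double alpha_i,
                     double *c_r, double *c_i)
{
    /* AGGREGATE_REALS_IMAGES, NN case: real = rr - ii, imag = ri + ir.        */
    double res_r = rr - ii;
    double res_i = ri + ir;
    /* MULT_APLHA_PART1/PART2: C += alpha * result (complex multiply-add).     */
    *c_r += res_r * alpha_r - res_i * alpha_i;
    *c_i += res_r * alpha_i + res_i * alpha_r;
}

int main(void)
{
    /* One k-step with a = 1+2i, b = 3+4i, alpha = 1, C = 0:
       a*b = (1*3 - 2*4) + (1*4 + 2*3)i = -5 + 10i.                            */
    double c_r = 0.0, c_i = 0.0;
    save_one(1.0*3.0, 2.0*4.0, 1.0*4.0, 2.0*3.0, 1.0, 0.0, &c_r, &c_i);
    printf("%g %+gi\n", c_r, c_i);   /* prints -5 +10i */
    return 0;
}
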
+ xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,128,32 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, 
vs23 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 128,32 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,64,32 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 + +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs41, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs43, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs47, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 64,32 +.endm + + + +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero2x2 + xxlxor 
vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,32,32 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 + +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 +.endm + + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs37, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x2 + LOAD2x2 + 
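Throughout these kernels each B value is kept twice: vs16/vs18 hold {b_r, b_i} and their xxswapd copies vs17/vs19 hold {b_i, b_r}, so one A vector {a_r, a_i} needs only two xvmaddadp per B value. One accumulator gathers {a_r*b_r, a_i*b_i} and its partner gathers {a_r*b_i, a_i*b_r}; the SAVE path later folds these into real and imaginary parts. A small C sketch of one k-step of that lane layout (illustrative only, not part of the patch; the lane order shown is nominal, the actual register layout depends on endianness):

#include <stdio.h>

/* Two-lane stand-in for one VSX register holding a double-complex value.   */
typedef struct { double lane0, lane1; } v2d;

static v2d fma2(v2d acc, v2d x, v2d y)          /* xvmaddadp: acc += x * y   */
{
    acc.lane0 += x.lane0 * y.lane0;
    acc.lane1 += x.lane1 * y.lane1;
    return acc;
}

int main(void)
{
    v2d a   = { 1.0, 2.0 };                      /* {a_r, a_i} from the A panel */
    v2d b   = { 3.0, 4.0 };                      /* vs16: {b_r, b_i} from B     */
    v2d bsw = { b.lane1, b.lane0 };              /* vs17 = xxswapd vs16         */

    v2d acc_rr_ii = { 0, 0 }, acc_ri_ir = { 0, 0 };
    acc_rr_ii = fma2(acc_rr_ii, a, b);           /* {a_r*b_r, a_i*b_i}          */
    acc_ri_ir = fma2(acc_ri_ir, a, bsw);         /* {a_r*b_i, a_i*b_r}          */

    printf("{%g,%g} {%g,%g}\n",
           acc_rr_ii.lane0, acc_rr_ii.lane1,     /* {3,8}                       */
           acc_ri_ir.lane0, acc_ri_ir.lane1);    /* {4,6}                       */
    return 0;
}
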
END2x2 AO, BO, 32,32 +.endm + + + +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,16,32 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs35, vs8, vs23 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=8 
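The *_2 macros form a software-pipelined body for the K loop: LOADnxM_2O pre-loads two k-iterations, KERNELnxM_L2 (Complete=0) consumes them while fetching the next pair, and KERNELnxM_E2 (Complete=1) consumes the final pair without reading past the panel, with IsLast=1 additionally bumping AO/BO. A plain C sketch of that control structure for a scalar dot product; this is an analogy only, not the patch's code, and it assumes an even trip count of at least two so the E2 form handles the tail.

#include <stdio.h>

/* Two-way unrolled, software-pipelined loop mirroring LOAD*_2O /
   KERNEL*_L2 / KERNEL*_E2: the steady state uses the values loaded by the
   previous step and pre-loads the next pair; the final step skips the load. */
static double dot_pipelined(const double *a, const double *b, int k)
{
    double acc = 0.0;
    double a0 = a[0], a1 = a[1], b0 = b[0], b1 = b[1];   /* LOAD*_2O          */
    int i;

    for (i = 2; i + 1 < k; i += 2) {                 /* KERNEL*_L2, Complete=0 */
        acc += a0 * b0 + a1 * b1;
        a0 = a[i]; a1 = a[i + 1];                    /* pre-load next pair     */
        b0 = b[i]; b1 = b[i + 1];
    }
    acc += a0 * b0 + a1 * b1;                        /* KERNEL*_E2, Complete=1 */
    return acc;
}

int main(void)
{
    double a[] = { 1, 2, 3, 4 }, b[] = { 5, 6, 7, 8 };
    printf("%g\n", dot_pipelined(a, b, 4));          /* 1*5+2*6+3*7+4*8 = 70   */
    return 0;
}
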
+**********************************************************************************************/ + + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,128,16 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 +.endm + + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, 
vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 128,16 +.endm + + +.macro SAVE1x8 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,64,16 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 +.endm + + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 64,16 +.endm + + + +.macro SAVE1x4 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, 
CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,32,16 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 +.endm + + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 32,16 +.endm + + + +.macro SAVE1x2 + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,16,16 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
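For the TRMM builds, SHIFT_REG converts an element count into a byte offset by shifting: one double-complex value is 16 bytes, so multiplying by C_A values is a left shift by log2(16*C_A), which is why slwi amounts 4 through 8 correspond to SHIFT_VAL 1 through 16. REFRESH_POINTERS then advances ptrba and ptrbb by off*C_A and off*C_B values, as the commented pseudo-C alongside the macros says. A small C check of that shift arithmetic, illustrative only and not part of the patch:

#include <assert.h>

/* SHIFT_REG: byte offset = count * values * sizeof(double complex).        */
static long shift_reg(long count, int values)
{
    switch (values) {
    case 16: return count << 8;   /* 16 values * 16 bytes = 256 = 1 << 8     */
    case  8: return count << 7;
    case  4: return count << 6;
    case  2: return count << 5;
    case  1: return count << 4;
    default: return count * values * 16;
    }
}

int main(void)
{
    long off = 3;
    /* REFRESH_POINTERS for an 8x2 tile: ptrba += off*8 values and
       ptrbb = bb + off*2 values, both expressed here in bytes.              */
    assert(shift_reg(off, 8) == off * 8 * 16);
    assert(shift_reg(off, 2) == off * 2 * 16);
    return 0;
}
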
+ /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 8bcd31ef2..522c6d7d9 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -62,6 +62,8 @@ gotoblas_t TABLE_NAME = { MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), #endif + SBGEMM_ALIGN_K, + sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, samax_kTS, samin_kTS, smax_kTS, smin_kTS, @@ -866,7 +868,7 @@ gotoblas_t TABLE_NAME = { cgeadd_kTS, #endif #if BUILD_COMPLEX16==1 - zgeadd_kTS + zgeadd_kTS, #endif }; diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index cb6f62981..548e5dcfc 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -44,8 +44,5 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c -CSCALKERNEL = ../arm/zscal.c -ZSCALKERNEL = ../arm/zscal.c - CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c diff --git a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S index 97958a88f..2675f71fb 100644 --- a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S +++ b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S @@ -1,1897 +1,1897 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddps -#define VFMADD_I vfmaddps -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddps -#define VFMADD_I vfmaddps -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddps -#define VFMADD_I vfnmaddps -#else -#define VFMADD_R vfnmaddps -#define VFMADD_I vfnmaddps -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I 
%xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - - - -/************************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $8, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, 
BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - 
-.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - vshufps $0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, 
%xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_20: - testq $3, M - jz .L2_60 // to next 2 lines of N - - testq $2, M - jz .L2_40 - ALIGN_4 - -.L2_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, 
SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL2x2_SUB(xxx) - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - 
movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - - vmovsd %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq 
K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_20: - testq $3, M - jz .L999 - - testq $2, M - jz .L1_40 - ALIGN_4 - -.L1_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq 
KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL2x1_SUB(xxx) - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_40: - testq $1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq 
$1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I 
vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ 
+ +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + 
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 
8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + 
vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO 
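+	// The unrolled loop below covers 16 k-iterations per pass, with an exit
+	// test every 8 (the count was rounded down to a multiple of 8 above).
+	// Each KERNEL2x2 step multiplies two complex elements of A by the real
+	// parts of the two B values (accumulated in %xmm8/%xmm10) and by their
+	// imaginary parts (%xmm9/%xmm11), with signs chosen by VFMADD_R/VFMADD_I;
+	// .L2_29 then combines the two accumulator sets into the complex results.
+	// AO and BO already point one K-panel ahead, so BI and %rax are negated
+	// below and count up toward zero as the KERNEL macros advance them.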
+ negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK 
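+	// KKK now holds the k-loop trip count for this tile when the file is
+	// built as a TRMM kernel; the remainder code at .L2_46 reloads it for
+	// the final k % 8 iterations.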
+#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next 
offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax 
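+	// TRMM bookkeeping: %rax = K - KKK, the k-iterations this tile did not
+	// consume; the adjustments below step BO past them (2 floats per k for
+	// the single B column) and AO as well (8 floats per k for 4 complex
+	// values of A), so both point at the next panel.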
+ movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S index 72deee12f..bf7f91ee9 100644 --- a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S +++ b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S @@ -1,1921 +1,1921 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/31 Saar -* -* Parameter: -* UNROLL_M 4 -* UNROLL_N 2 -* CGEMM_P 768 -* CGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) -* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) -* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) -* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) -* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) -* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) -* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) -* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) -* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddps -#define VFMADD_I vfmaddps -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddps -#define VFMADD_I vfmaddps -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddps -#define VFMADD_I vfnmaddps -#else -#define VFMADD_R vfnmaddps -#define VFMADD_I vfnmaddps -#endif - - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, 
SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - - - -/************************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $8, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - 
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - 
vmovss %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, 
%xmm11 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - vshufps $0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_20: - testq $3, M - jz .L2_60 // to next 2 lines of N - - testq $2, M - jz .L2_40 - ALIGN_4 - -.L2_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - 
KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL2x2_SUB(xxx) - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - 
KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - - vmovsd %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - - - 
-/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_20: - testq $3, M - jz .L999 - - testq $2, M - jz .L1_40 - ALIGN_4 - -.L1_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL2x1_SUB(xxx) - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_40: - testq 
$1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 
64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 4 +* UNROLL_N 2 +* CGEMM_P 768 +* CGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) +* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) +* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) +* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) +* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) +* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) +* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) +* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) +* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, 
SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + 
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + 
vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, 
%xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + 
KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + 
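(Note for readers of this hunk, not part of the patch itself: the .L2_19/.L2_29/.L2_49 and .L1_19/.L1_29/.L1_49 epilogues above all finish the complex arithmetic with the same vshufps $0xb1 / vaddsubps idiom before applying alpha and writing back to C; the recurring "swap high and low 64 bytes" comment really describes swapping the 32-bit real/imaginary halves within each 64-bit complex element. As a rough orientation aid only, the following plain-C sketch shows the per-element arithmetic that idiom is equivalent to on the non-TRMM path. The names cgemm_epilogue, acc_r and acc_i are illustrative inventions, standing for the real/imaginary partial sums the kernel keeps in separate registers (e.g. xmm8/xmm9) after the conjugation-dependent combine step.)

#include <complex.h>

/* Illustrative sketch: C[i] += alpha * acc, with alpha = ALPHA_R + I*ALPHA_I.
 * acc is the accumulated A*B partial product for one complex element, with
 * signs already adjusted for the NN/CN/NC/CC conjugation variants.        */
static inline float _Complex cgemm_epilogue(float _Complex c,
                                            float alpha_r, float alpha_i,
                                            float acc_r, float acc_i)
{
    float _Complex alpha = alpha_r + alpha_i * I;
    float _Complex acc   = acc_r  + acc_i  * I;
    /* In the assembly this is the vmulps with the broadcast ALPHA_R/ALPHA_I,
     * the final vaddsubps, and the vaddps/vmovups against (CO1).           */
    return c + alpha * acc;
}

(The vmulps of the un-swapped accumulator by ALPHA_R yields (ar*xr, ar*xi), the vmulps of the swapped copy by ALPHA_I yields (ai*xi, ai*xr), and vaddsubps subtracts in the low lane and adds in the high lane, giving exactly (ar*xr - ai*xi, ar*xi + ai*xr), i.e. the complex product alpha*acc.)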
+/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq 
$1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 
64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemm_kernel_8x2_sandy.S b/kernel/x86_64/cgemm_kernel_8x2_sandy.S index c85646d43..988913591 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_sandy.S +++ b/kernel/x86_64/cgemm_kernel_8x2_sandy.S @@ -1,2353 +1,2353 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
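For orientation while reading the kernel body that follows: the SAVE* macros in cgemm_kernel_8x2_sandy.S fold two partial-sum registers into a complex product and then scale by the complex alpha, each time with a vshufps (swap real/imaginary lanes) followed by vaddsubps. A minimal C sketch of the per-element arithmetic for the non-conjugated (NN-style) case is given below; the function and parameter names are illustrative, not taken from the source.

```c
/* Hypothetical helper, not part of the kernel source.
 * sum_rr = sum(a_r*b_r), sum_ir = sum(a_i*b_r),
 * sum_ri = sum(a_r*b_i), sum_ii = sum(a_i*b_i),
 * as accumulated by the KERNEL*_SUB macros into separate registers. */
static void cgemm_save_element(float c[2],
                               float sum_rr, float sum_ir,
                               float sum_ri, float sum_ii,
                               float alpha_r, float alpha_i)
{
    /* first vshufps/vaddsubps pair: fold the accumulators into a*b */
    float t_r = sum_rr - sum_ii;
    float t_i = sum_ir + sum_ri;

    /* second vshufps/vmulps/vaddsubps group: multiply by complex alpha */
    float r = t_r * alpha_r - t_i * alpha_i;
    float i = t_i * alpha_r + t_r * alpha_i;

    /* the non-TRMM path adds the existing C values before the store;
     * the TRMMKERNEL build stores r and i directly */
    c[0] += r;
    c[1] += i;
}
```

This mirrors the vector code only element-wise; the assembly performs the same sequence on 2, 4 or 8 complex values per register at once.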
-**********************************************************************************/ - -/********************************************************************* -* 2014/07/29 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* CGEMM_DEFAULT_UNROLL_N 2 -* CGEMM_DEFAULT_UNROLL_M 8 -* CGEMM_DEFAULT_P 768 -* CGEMM_DEFAULT_Q 512 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/29 Saar -* Performance at 6192x6192x6192: -* 1 thread: 49 GFLOPS (MKL: 52) -* 2 threads: 99 GFLOPS (MKL: 102) -* 3 threads: 148 GFLOPS (MKL: 150) -* 4 threads: 195 GFLOPS (MKL: 194) -* 8 threads: 354 GFLOPS (MKL: 317) -* -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vaddps y0,%ymm2,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vaddps y0,%ymm3,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vaddps y0,%xmm2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vaddps y0,%xmm3,y0 - - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vsubps %ymm2,y0,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vaddps y0,%ymm3,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vsubps %xmm2,y0,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vaddps y0,%xmm3,y0 - - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vaddps y0,%ymm2,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vsubps %ymm3,y0,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vaddps y0,%xmm2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vsubps %xmm3,y0,y0 - - -#else - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vsubps %ymm2,y0,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vsubps 
%ymm3,y0,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vsubps %xmm2,y0,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vsubps %xmm3,y0,y0 - - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/***************************************************************************************************************************/ - -.macro KERNEL8x2_1 - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - prefetcht0 A_PR1(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+64(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+128(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+192(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - addq $ 16, BI - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - addq $ 64, %rax -.endm - - -.macro KERNEL8x2_SUB - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 
64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm10, %ymm11,%ymm11 - vaddsubps %ymm12, %ymm13,%ymm13 - vaddsubps %ymm14, %ymm15,%ymm15 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm11, %ymm10 - vmovaps %ymm13, %ymm12 - vmovaps %ymm15, %ymm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm10, %ymm0, %ymm10 - vmulps %ymm12, %ymm0, %ymm12 - vmulps %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm11, %ymm1, %ymm11 - vmulps %ymm13, %ymm1, %ymm13 - vmulps %ymm15, %ymm1, %ymm15 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - - vaddps (CO1, LDC), %ymm10, %ymm10 - vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 8 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, 
%xmm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps 
%xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - vmovsd %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm12, %ymm13,%ymm13 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm13, %ymm12 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm13, %ymm1, %ymm13 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and 
low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq 
OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L2_4_10 - - ALIGN_4 -/**********************************************************************************************************/ - -.L2_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - je .L2_8_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - je .L2_8_16 - - jmp .L2_8_12 - ALIGN_4 - -.L2_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_17: - - KERNEL8x2_SUB - - jl .L2_8_17 - ALIGN_4 - - -.L2_8_19: - - SAVE8x2 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq 
KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_8_11 - ALIGN_4 - - -/**********************************************************************************************************/ - - - - -.L2_4_10: - testq $ 7, M - jz .L2_4_60 // to next 2 lines of N - - testq $ 4, M - jz .L2_4_20 - ALIGN_4 - - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_4_20: - - testq $ 2, M - jz .L2_4_40 - ALIGN_4 - -.L2_4_21: - -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - jmp .L2_4_22 - ALIGN_4 - -.L2_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_27: - - KERNEL2x2_SUB - - jl .L2_4_27 - ALIGN_4 - - -.L2_4_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - 
addq $ 4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_4_21 - ALIGN_4 - - - -/**************************************************************************/ -.L2_4_40: - testq $ 1, M - jz .L2_4_60 // to next 2 lines of N - - ALIGN_4 - -.L2_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - jmp .L2_4_42 - ALIGN_4 - -.L2_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_47: - - KERNEL1x2_SUB - - jl .L2_4_47 - ALIGN_4 - - -.L2_4_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_4_41 - ALIGN_4 - - - - -.L2_4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) 
&& defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L1_4_10 - - ALIGN_4 - -/**************************************************************************************************/ - -.L1_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - jmp .L1_8_12 - ALIGN_4 - -.L1_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_17: - - KERNEL8x1_SUB - - jl .L1_8_17 - ALIGN_4 - - -.L1_8_19: - - SAVE8x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_8_11 - ALIGN_4 - - - -/**************************************************************************************************/ -.L1_4_10: - - testq $ 7, M - jz .L999 - - testq $ 4, M - jz .L1_4_20 - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_4_20: - - testq $ 2, M - jz .L1_4_40 - ALIGN_4 - -.L1_4_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - jmp .L1_4_22 - ALIGN_4 - -.L1_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_27: - - KERNEL2x1_SUB - - jl .L1_4_27 - ALIGN_4 - - -.L1_4_29: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_4_40: - testq $ 1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 
B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - jmp .L1_4_42 - ALIGN_4 - -.L1_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_47: - - KERNEL1x1_SUB - - jl .L1_4_47 - ALIGN_4 - - -.L1_4_49: - - SAVE1x1 - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
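The re-added copy of the kernel below defines four VFMADDPS_* macro groups that differ only in whether the real and imaginary partial products are added or subtracted. Together with the two vaddsubps orderings in the SAVE* macros, these sign choices appear to correspond to conjugating A and/or B, as selected by the compile-time NN/NT/.../CC defines. A rough, hedged C equivalent of one inner-product element (names are mine, not from the source):

```c
#include <complex.h>

/* Illustrative sketch only: conjugation handling that the sign variants
 * of VFMADDPS_R / VFMADDPS_I effectively implement. */
static float complex cdot_variant(const float complex *a,
                                  const float complex *b,
                                  int k, int conjugate_a, int conjugate_b)
{
    float complex sum = 0.0f;
    for (int i = 0; i < k; i++) {
        float complex ai = conjugate_a ? conjf(a[i]) : a[i];
        float complex bi = conjugate_b ? conjf(b[i]) : b[i];
        /* the kernel keeps the real/imaginary partial products in
         * separate vector registers and folds them in the SAVE* macros */
        sum += ai * bi;
    }
    return sum;
}
```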
+**********************************************************************************/ + +/********************************************************************* +* 2014/07/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* CGEMM_DEFAULT_UNROLL_N 2 +* CGEMM_DEFAULT_UNROLL_M 8 +* CGEMM_DEFAULT_P 768 +* CGEMM_DEFAULT_Q 512 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/29 Saar +* Performance at 6192x6192x6192: +* 1 thread: 49 GFLOPS (MKL: 52) +* 2 threads: 99 GFLOPS (MKL: 102) +* 3 threads: 148 GFLOPS (MKL: 150) +* 4 threads: 195 GFLOPS (MKL: 194) +* 8 threads: 354 GFLOPS (MKL: 317) +* +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPS_YR( y0,y1,y2 ) \ + vmulps y1,y2,%ymm2;\ + vaddps y0,%ymm2,y0 + +#define VFMADDPS_YI( y0,y1,y2 ) \ + vmulps y1,y2,%ymm3;\ + vaddps y0,%ymm3,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vaddps y0,%xmm2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vaddps y0,%xmm3,y0 + + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPS_YR( y0,y1,y2 ) \ + vmulps y1,y2,%ymm2;\ + vsubps %ymm2,y0,y0 + +#define VFMADDPS_YI( y0,y1,y2 ) \ + vmulps y1,y2,%ymm3;\ + vaddps y0,%ymm3,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vsubps %xmm2,y0,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vaddps y0,%xmm3,y0 + + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPS_YR( y0,y1,y2 ) \ + vmulps y1,y2,%ymm2;\ + vaddps y0,%ymm2,y0 + +#define VFMADDPS_YI( y0,y1,y2 ) \ + vmulps y1,y2,%ymm3;\ + vsubps %ymm3,y0,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vaddps y0,%xmm2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vsubps %xmm3,y0,y0 + + +#else + +#define VFMADDPS_YR( y0,y1,y2 ) \ + vmulps y1,y2,%ymm2;\ + vsubps %ymm2,y0,y0 + +#define VFMADDPS_YI( y0,y1,y2 ) \ + vmulps y1,y2,%ymm3;\ + vsubps 
%ymm3,y0,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vsubps %xmm2,y0,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vsubps %xmm3,y0,y0 + + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/***************************************************************************************************************************/ + +.macro KERNEL8x2_1 + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + prefetcht0 A_PR1(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+64(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+128(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+192(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + addq $ 16, BI + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + addq $ 64, %rax +.endm + + +.macro KERNEL8x2_SUB + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 
64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm10, %ymm11,%ymm11 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm14, %ymm15,%ymm15 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm11, %ymm10 + vmovaps %ymm13, %ymm12 + vmovaps %ymm15, %ymm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm10, %ymm0, %ymm10 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm11, %ymm1, %ymm11 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm15, %ymm1, %ymm15 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + + vaddps (CO1, LDC), %ymm10, %ymm10 + vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 8 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, 
%xmm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps 
%xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm12, %ymm13,%ymm13 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm13, %ymm12 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm13, %ymm1, %ymm13 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and 
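On the recurring "swap high and low 64 bytes" / "swap high and low 4 bytes" comments in the SAVE macros: the vshufps $0xb1 instruction they describe swaps each adjacent pair of 4-byte floats within every 128-bit lane (selector 0xb1 picks elements 1,0,3,2), i.e. it exchanges the real and imaginary halves of each 8-byte complex value. A small standalone check of the selector, using the SSE intrinsic form of the same shuffle (illustrative only, not code from this patch):

    #include <stdio.h>
    #include <immintrin.h>

    int main(void)
    {
        __m128 v = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);  /* re0, im0, re1, im1        */
        __m128 s = _mm_shuffle_ps(v, v, 0xb1);           /* same imm8 as vshufps above */
        float out[4];
        _mm_storeu_ps(out, s);
        printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* prints: 1 0 3 2 */
        return 0;
    }

For the 256-bit vshufps used in these macros the same element pattern is applied independently in each 128-bit lane, so every complex pair is swapped but nothing crosses lanes.
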
low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq 
OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + je .L2_8_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + je .L2_8_16 + + jmp .L2_8_12 + ALIGN_4 + +.L2_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_17: + + KERNEL8x2_SUB + + jl .L2_8_17 + ALIGN_4 + + +.L2_8_19: + + SAVE8x2 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq 
KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + + + +.L2_4_10: + testq $ 7, M + jz .L2_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L2_4_20 + ALIGN_4 + + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_4_20: + + testq $ 2, M + jz .L2_4_40 + ALIGN_4 + +.L2_4_21: + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + jmp .L2_4_22 + ALIGN_4 + +.L2_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_27: + + KERNEL2x2_SUB + + jl .L2_4_27 + ALIGN_4 + + +.L2_4_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + 
addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L2_4_40: + testq $ 1, M + jz .L2_4_60 // to next 2 lines of N + + ALIGN_4 + +.L2_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + jmp .L2_4_42 + ALIGN_4 + +.L2_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_47: + + KERNEL1x2_SUB + + jl .L2_4_47 + ALIGN_4 + + +.L2_4_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_4_41 + ALIGN_4 + + + + +.L2_4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) 
&& defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L1_4_10 + + ALIGN_4 + +/**************************************************************************************************/ + +.L1_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + jmp .L1_8_12 + ALIGN_4 + +.L1_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_17: + + KERNEL8x1_SUB + + jl .L1_8_17 + ALIGN_4 + + +.L1_8_19: + + SAVE8x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_8_11 + ALIGN_4 + + + +/**************************************************************************************************/ +.L1_4_10: + + testq $ 7, M + jz .L999 + + testq $ 4, M + jz .L1_4_20 + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_4_20: + + testq $ 2, M + jz .L1_4_40 + ALIGN_4 + +.L1_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + jmp .L1_4_22 + ALIGN_4 + +.L1_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_27: + + KERNEL2x1_SUB + + jl .L1_4_27 + ALIGN_4 + + +.L1_4_29: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_4_40: + testq $ 1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + jmp .L1_4_42 + ALIGN_4 + +.L1_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_47: + + KERNEL1x1_SUB + + jl .L1_4_47 + ALIGN_4 + + +.L1_4_49: + + SAVE1x1 + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index dc3f688c6..95a99b8b9 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -25,10 +25,25 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. + */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "cscal_microk_skylakex-2.c" +#elif defined(HASWELL) || defined(ZEN) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c new file mode 100644 index 000000000..8a622427b --- /dev/null +++ b/kernel/x86_64/cscal_microk_skylakex-2.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + /* _mm512_addsub_ps does not exist so we flip signs for odd elements of da_i */ + __m512 da_r = _mm512_set1_ps(alpha[0]); + __m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1); + for (; i < n2; i += 32) { + __m512 x0 = _mm512_loadu_ps(&x[i + 0]); + __m512 x1 = _mm512_loadu_ps(&x[i + 16]); + __m512 y0 = _mm512_permute_ps(x0, 0xb1); + __m512 y1 = _mm512_permute_ps(x1, 0xb1); + _mm512_storeu_ps(&x[i + 0], _mm512_add_ps(da_r * x0, da_i * y0)); + _mm512_storeu_ps(&x[i + 16], _mm512_add_ps(da_r * x1, da_i * y1)); + } +#else + __m256 da_r = _mm256_set1_ps(alpha[0]); + __m256 da_i = _mm256_set1_ps(alpha[1]); + for (; i < n2; i += 32) { + __m256 x0 = _mm256_loadu_ps(&x[i + 0]); + __m256 x1 = _mm256_loadu_ps(&x[i + 8]); + __m256 x2 = _mm256_loadu_ps(&x[i + 16]); + __m256 x3 = _mm256_loadu_ps(&x[i + 24]); + __m256 y0 = _mm256_permute_ps(x0, 0xb1); + __m256 y1 = _mm256_permute_ps(x1, 0xb1); + __m256 y2 = _mm256_permute_ps(x2, 0xb1); + __m256 y3 = _mm256_permute_ps(x3, 0xb1); + _mm256_storeu_ps(&x[i + 0], _mm256_addsub_ps(da_r * x0, da_i * y0)); + _mm256_storeu_ps(&x[i + 8], _mm256_addsub_ps(da_r * x1, da_i * y1)); + _mm256_storeu_ps(&x[i + 16], _mm256_addsub_ps(da_r * x2, da_i * y2)); + _mm256_storeu_ps(&x[i + 24], _mm256_addsub_ps(da_r * x3, da_i * y3)); + } +#endif +} + + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1); + for (; i < n2; i += 32) { + __m512 y0 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 0]), 0xb1); + __m512 y1 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 16]), 0xb1); + _mm512_storeu_ps(&x[i + 0], da_i * y0); + _mm512_storeu_ps(&x[i + 16], da_i * y1); + } +#else + __m256 da_i = _mm256_set1_ps(alpha[1]) * _mm256_set_ps(1, -1, 1, -1, 1, -1, 1, -1); + for (; i < n2; i += 32) { + __m256 y0 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 0]), 0xb1); + __m256 y1 = 
_mm256_permute_ps(_mm256_loadu_ps(&x[i + 8]), 0xb1); + __m256 y2 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 16]), 0xb1); + __m256 y3 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 24]), 0xb1); + _mm256_storeu_ps(&x[i + 0], da_i * y0); + _mm256_storeu_ps(&x[i + 8], da_i * y1); + _mm256_storeu_ps(&x[i + 16], da_i * y2); + _mm256_storeu_ps(&x[i + 24], da_i * y3); + } +#endif +} + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512 da_r = _mm512_set1_ps(alpha[0]); + for (; i < n2; i += 32) { + _mm512_storeu_ps(&x[i + 0], da_r * _mm512_loadu_ps(&x[i + 0])); + _mm512_storeu_ps(&x[i + 16], da_r * _mm512_loadu_ps(&x[i + 16])); + } +#else + __m256 da_r = _mm256_set1_ps(alpha[0]); + for (; i < n2; i += 32) { + _mm256_storeu_ps(&x[i + 0], da_r * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&x[i + 8], da_r * _mm256_loadu_ps(&x[i + 8])); + _mm256_storeu_ps(&x[i + 16], da_r * _mm256_loadu_ps(&x[i + 16])); + _mm256_storeu_ps(&x[i + 24], da_r * _mm256_loadu_ps(&x[i + 24])); + } +#endif +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512 zero = _mm512_setzero_ps(); + for (; i < n2; i += 32) { + _mm512_storeu_ps(&x[i], zero); + _mm512_storeu_ps(&x[i + 16], zero); + } +#else + __m256 zero = _mm256_setzero_ps(); + for (; i < n2; i += 32) { + _mm256_storeu_ps(&x[i + 0], zero); + _mm256_storeu_ps(&x[i + 8], zero); + _mm256_storeu_ps(&x[i + 16], zero); + _mm256_storeu_ps(&x[i + 24], zero); + } +#endif + +} + +#else +#include "cscal_microk_haswell-2.c" +#endif diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S index 98b582c0d..899c5f241 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_haswell.S +++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S @@ -1,5215 +1,5215 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
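The pragmas added to cscal.c above disable floating-point contraction because a fused multiply-add evaluates a*b +/- c*d with a single rounding on the fused half, which can break cancellations that complex arithmetic relies on. A standalone illustration of that effect; this is not code from the patch, and fmaf from <math.h> merely stands in for what a contracting compiler may emit:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* z = a + b*i; the imaginary part of z * conj(z) is a*(-b) + b*a,
           which is exactly zero when both products are rounded separately. */
        float a = 1.0f / 3.0f;
        float b = 1.0f / 7.0f;

        float im_plain = a * (-b) + b * a;   /* exactly 0.0f without contraction  */
        float im_fused = fmaf(a, -b, b * a); /* typically a tiny nonzero residual */

        printf("plain = %g, fused = %g\n", im_plain, im_fused);
        return 0;
    }

Since the pragmas sit in cscal.c, they also appear to cover the intrinsic kernels it includes, such as the cscal_microk_skylakex-2.c file added above.
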
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -/********************************************************************* -* 2013/10/20 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK - -* -* -* 2013/10/20 Saar -* Parameter: -* DGEMM_DEFAULT_UNROLL_N 2 -* DGEMM_DEFAULT_UNROLL_M 16 -* DGEMM_DEFAULT_P 192 -* DGEMM_DEFAULT_Q 128 -* A_PR1 512 -* -* -* Performance without prefetch of B: -* 1 thread: 45.8 GFLOPS (MKL: 45) -* 2 threads: 80.0 GFLOPS (MKL: 91) -* 4 threads: 135.0 GFLOPS (MKL: 135) -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 512*8*4 -#define LB2_OFFSET 512*8*2 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -.macro VFMADD231PD_ y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmaddsd \x0,\x1,\x2,\x0 -.endm - -#else - -.macro VFMADD231PD_ y0,y1,y2 - vfmadd231pd \y2,\y1,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmadd231sd \x2,\x1,\x0 -.endm - -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -.macro KERNEL16x3_SUBN - prefetcht0 
A_PR1(AO) - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovaps -12 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 A_PR1+64(AO) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovaps -8 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovaps -4 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $ 3*SIZE , BO - addq $ 16*SIZE, AO -.endm - - -.macro KERNEL8x3_SUBN - //prefetcht0 A_PR1(AO) - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovaps -12 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - //prefetcht0 A_PR1+64(AO) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - prefetcht0 B_PR1(BO) - addq $ 3*SIZE , BO - addq $ 8*SIZE, AO -.endm - -.macro KERNEL4x3_SUBN - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $ 3*SIZE , BO - addq $ 4*SIZE, AO -.endm - -.macro KERNEL2x3_SUBN - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -10 * SIZE(BO), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -15 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $ 3*SIZE , BO - addq $ 2*SIZE, AO -.endm - -.macro KERNEL1x3_SUBN - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -10 * SIZE(BO), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $ 3*SIZE , BO - addq $ 1*SIZE, AO -.endm - - - - - - -/******************************************************************************************/ - -.macro KERNEL16x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - - - - -.macro KERNEL16x3_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ 
%ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - prefetcht0 A_PR1+64(AO,%rax,SIZE) - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - addq $12, BI - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $64, %rax - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $3 , BI - addq $16, %rax -.endm - -.macro SAVE16x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm15, %ymm15 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * 
SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 - vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) - vmovups %ymm15,12 * SIZE(CO1, LDC, 2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $12, BI - addq $32, %rax -.endm - -.macro KERNEL8x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $3 , BI - addq $8 , %rax -.endm - -.macro SAVE8x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - - 
vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_2 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_3 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_4 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $12, BI - addq $16, %rax -.endm - -.macro KERNEL4x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $3 , BI - addq $4 , %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -29 
* SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $12, BI - addq $8, %rax -.endm - -.macro KERNEL2x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $3 , BI - addq $2 , %rax -.endm - -.macro SAVE2x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm12, %xmm12 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) - -.endm - -/*******************************************************************************************/ - -.macro KERNEL1x3_1 - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $12, BI - addq $4, %rax -.endm - -.macro KERNEL1x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd 
-32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $3 , BI - addq $1 , %rax -.endm - -.macro SAVE1x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $8, BI - addq $64, %rax -.endm - -.macro KERNEL16x2_SUB - vbroadcastsd -4 * 
SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $2, BI - addq $16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - addq $8, BI - addq $32, %rax -.endm - -.macro KERNEL8x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - addq $2, BI - addq $8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd 
%ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_2 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_3 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_4 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $8, BI - addq $16, %rax -.endm - -.macro KERNEL4x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $2, BI - addq $4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $8, BI - addq 
$8, %rax -.endm - -.macro KERNEL2x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $2, BI - addq $2, %rax -.endm - -.macro SAVE2x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm10, %xmm10 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_1 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $8, BI - addq $4, %rax -.endm - -.macro KERNEL1x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $2, BI - addq $1, %rax -.endm - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 
- VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $4, BI - addq $64, %rax -.endm - -.macro KERNEL16x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $1, BI - addq $16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $4, BI - addq $32, %rax -.endm - -.macro KERNEL8x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $1, BI - addq $8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - 
VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $4, BI - addq $16, %rax -.endm - -.macro KERNEL4x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $1, BI - addq $4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $4, BI - addq $8, %rax -.endm - -.macro KERNEL2x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $1, BI - addq $2 , %rax -.endm - -.macro SAVE2x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro KERNEL1x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $ 1, BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq 
%rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovups 0 * SIZE(BO1), %xmm0 - vmovsd 0 * SIZE(BO2), %xmm2 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm2, 2*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 
6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups 0*SIZE(BO2), %xmm1 - vmovsd %xmm0, 0*SIZE(BO) - vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - prefetcht0 (CO1) - prefetcht0 (CO1,LDC,1) - prefetcht0 (CO1,LDC,2) - prefetcht0 64(CO1) - prefetcht0 64(CO1,LDC,1) - prefetcht0 64(CO1,LDC,2) - - vzeroall - - movq K, %rax - - sarq $1, %rax // K / 8 - je .L6_16 - - ALIGN_5 - -.L6_12: -/* - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) -*/ - KERNEL16x3_SUBN - KERNEL16x3_SUBN -/* - KERNEL16x3_SUBN - KERNEL16x3_SUBN - - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN -*/ - dec %rax - jne .L6_12 - -.L6_16: - movq K, %rax - - andq $1, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUBN - - dec %rax - jne .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - dec %rax - jne .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUBN - - dec %rax - jne .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - dec %rax - jne .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je 
.L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUBN - - dec %rax - jne .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_36 - ALIGN_4 - -.L6_32: - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - dec %rax - jne .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUBN - - dec %rax - jne .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3,%rax - je .L6_46 - - ALIGN_4 - -.L6_42: - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - dec %rax - jne .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUBN - - dec %rax - jne .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - prefetcht0 (CO1) - prefetcht0 (CO1,LDC,1) - prefetcht0 (CO1,LDC,2) - prefetcht0 64(CO1) - prefetcht0 64(CO1,LDC,1) - prefetcht0 64(CO1,LDC,2) - - vzeroall - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L7_16 - ALIGN_5 - -.L7_12: -/* - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) -*/ - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - dec %rax - jne .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_5 - -.L7_17: - - KERNEL16x3_SUBN - - dec %rax - jne .L7_17 - - -.L7_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - dec %rax - jne .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUBN - - dec %rax - jne .L7_20_7 - ALIGN_4 - -.L7_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - 
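[Orientation note for readers of this patch: the removed KERNELMxN_* / SAVEMxN macros above all follow one pattern. The inner loop loads a packed column strip of A, broadcasts elements of a packed B panel, and accumulates with FMA; the save step then scales the accumulators by alpha and, in the plain GEMM build (no TRMMKERNEL), adds the existing contents of C before storing. The stand-alone C/AVX2 sketch below mirrors roughly what one 4x3 tile update (KERNEL4x3_SUB followed by SAVE4x3) computes. It is illustrative only and not code from this repository; the function name and the packed 4xK / 3xK panel layout are assumptions.]

/* Minimal sketch of a 4x3 FMA micro-kernel tile update (illustrative only).
 * a: packed 4xK strip of A (4 doubles per k step)
 * b: packed 3xK strip of B (3 doubles per k step)
 * c: column-major C with leading dimension ldc (in elements)
 * Compile with e.g. -mavx2 -mfma. */
#include <immintrin.h>

static void dgemm_micro_4x3(long k, double alpha,
                            const double *a, const double *b,
                            double *c, long ldc)
{
    __m256d c0 = _mm256_setzero_pd();   /* column 0 accumulator (ymm4 in the asm) */
    __m256d c1 = _mm256_setzero_pd();   /* column 1 accumulator (ymm5) */
    __m256d c2 = _mm256_setzero_pd();   /* column 2 accumulator (ymm6) */

    for (long i = 0; i < k; i++) {
        __m256d a0 = _mm256_loadu_pd(a + 4 * i);          /* vmovups of the A strip   */
        __m256d b0 = _mm256_broadcast_sd(b + 3 * i + 0);  /* vbroadcastsd of B values */
        __m256d b1 = _mm256_broadcast_sd(b + 3 * i + 1);
        __m256d b2 = _mm256_broadcast_sd(b + 3 * i + 2);
        c0 = _mm256_fmadd_pd(a0, b0, c0);                 /* VFMADD231PD_ accumulate  */
        c1 = _mm256_fmadd_pd(a0, b1, c1);
        c2 = _mm256_fmadd_pd(a0, b2, c2);
    }

    /* SAVE step: scale by alpha; GEMM adds the old C, TRMM overwrites it. */
    __m256d va = _mm256_set1_pd(alpha);
    c0 = _mm256_mul_pd(c0, va);
    c1 = _mm256_mul_pd(c1, va);
    c2 = _mm256_mul_pd(c2, va);
#if !defined(TRMMKERNEL)
    c0 = _mm256_add_pd(c0, _mm256_loadu_pd(c + 0 * ldc));
    c1 = _mm256_add_pd(c1, _mm256_loadu_pd(c + 1 * ldc));
    c2 = _mm256_add_pd(c2, _mm256_loadu_pd(c + 2 * ldc));
#endif
    _mm256_storeu_pd(c + 0 * ldc, c0);
    _mm256_storeu_pd(c + 1 * ldc, c1);
    _mm256_storeu_pd(c + 2 * ldc, c2);
}

[The 16x3 and 8x3 variants are the same idea with four or two A vectors per k step (accumulators ymm4 through ymm15), while the 2xN and 1xN tails use scalar vmovsd / VFMADD231SD_ instead of 256-bit vectors.]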
-/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - dec %rax - jne .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUBN - - dec %rax - jne .L7_27 - ALIGN_4 - - -.L7_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - dec %rax - jne .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUBN - - dec %rax - jne .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_46 - - ALIGN_4 - -.L7_42: - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - dec %rax - jne .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUBN - - dec %rax - jne .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm1 - vmovups 4*SIZE(BO1), %xmm2 - vmovups 6*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovups %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, 
SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq 
(BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; 
number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * 
SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je 
.L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm1 - vmovups 4*SIZE(BO1), %xmm2 - vmovups 6*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovups %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) 
&& !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - 
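Both the removed kernel above and its replacement drive every tile with the same K-loop shape: andq $-8 selects the part of K that runs through groups of four unrolled KERNEL*_1..KERNEL*_4 macros (eight updates between each exit check), and andq $7 leaves a remainder finished one step at a time with KERNEL*_SUB. A minimal C sketch of that split, with a plain scalar accumulator standing in for the ymm/xmm register block (k_loop_sketch is an illustrative name, not OpenBLAS API):

#include <stdio.h>

/* K-loop split: eight updates per trip in the unrolled part (K & ~7),
 * then a one-at-a-time remainder (K & 7), mirroring andq $-8 / andq $7. */
static double k_loop_sketch(long K, const double *a, const double *b)
{
    double acc = 0.0;                  /* stands in for the ymm accumulators   */
    long k = 0;
    for (; k + 8 <= K; k += 8)         /* unrolled body: KERNEL*_1.._4, twice  */
        for (int u = 0; u < 8; u++)
            acc += a[k + u] * b[k + u];
    for (; k < K; k++)                 /* remainder: KERNEL*_SUB               */
        acc += a[k] * b[k];
    return acc;
}

int main(void)
{
    double a[11], b[11];
    for (int i = 0; i < 11; i++) { a[i] = i; b[i] = 1.0; }
    printf("%g\n", k_loop_sketch(11, a, b));   /* prints 55 */
    return 0;
}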
KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for 
Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - 
movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je 
.L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in 
AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/********************************************************************* +* 2013/10/20 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/20 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 +* +* +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_SUBN + prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovaps -8 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovaps -4 * SIZE(AO), %ymm0 + VFMADD231PD_ 
%ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 16*SIZE, AO +.endm + + +.macro KERNEL8x3_SUBN + //prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + //prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $ 3*SIZE , BO + addq $ 8*SIZE, AO +.endm + +.macro KERNEL4x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 4*SIZE, AO +.endm + +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 2*SIZE, AO +.endm + +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + +.macro KERNEL16x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + + + + +.macro KERNEL16x3_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + prefetcht0 A_PR1+64(AO,%rax,SIZE) + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ 
%ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + addq $12, BI + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $64, %rax + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm15, %ymm15 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 + vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, 
LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) + vmovups %ymm15,12 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + 
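All of the multiply-accumulate work in the macros above goes through the VFMADD231PD_/VFMADD231SD_ wrappers, which expand to the FMA4 forms (vfmaddpd/vfmaddsd) on BULLDOZER and to the FMA3 forms (vfmadd231pd/vfmadd231sd), with their different operand order, everywhere else. A minimal C intrinsics sketch of the FMA3 path only, assuming compilation with -mavx -mfma (fma_acc is an illustrative name, not part of this kernel):

#include <immintrin.h>

/* acc += a[0..3] * broadcast(*b): one vmovups + vbroadcastsd + vfmadd231pd,
 * the same step VFMADD231PD_ performs on a 4-double slice of the A panel. */
static inline __m256d fma_acc(__m256d acc, const double *a, const double *b)
{
    __m256d va = _mm256_loadu_pd(a);       /* vmovups: 4 doubles of packed A */
    __m256d vb = _mm256_broadcast_sd(b);   /* vbroadcastsd: one B value      */
    return _mm256_fmadd_pd(va, vb, acc);   /* vfmadd231pd: acc += va * vb    */
}

int main(void)
{
    double a[4] = {1, 2, 3, 4}, b = 10.0, out[4];
    __m256d acc = _mm256_setzero_pd();
    acc = fma_acc(acc, a, &b);
    _mm256_storeu_pd(out, acc);            /* out = {10, 20, 30, 40} */
    return (int)(out[3] != 40.0);
}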
+/*******************************************************************************************/ + +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + 
VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + 
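Every SAVE* macro in this file, including the SAVE1x3 completed just below, does the same three things: scale the accumulators by ALPHA, add the existing C tile unless TRMMKERNEL is defined, and store the result back. A minimal C sketch of that store step for one 4-wide slice, assuming AVX intrinsics and an illustrative name save_slice (not OpenBLAS API):

#include <immintrin.h>

/* r = alpha * acc; if this is not a TRMM build, r += current C; then store. */
static void save_slice(double *c, __m256d acc, double alpha, int trmmkernel)
{
    __m256d va = _mm256_broadcast_sd(&alpha);     /* vbroadcastsd ALPHA     */
    __m256d r  = _mm256_mul_pd(va, acc);          /* vmulpd                 */
    if (!trmmkernel)
        r = _mm256_add_pd(_mm256_loadu_pd(c), r); /* vaddpd (CO1), ...      */
    _mm256_storeu_pd(c, r);                       /* vmovups %ymmN, (CO1)   */
}

int main(void)
{
    double c[4] = {1, 1, 1, 1};
    __m256d acc = _mm256_set1_pd(2.0);
    save_slice(c, acc, 3.0, 0);                   /* c becomes {7, 7, 7, 7} */
    return (int)(c[0] != 7.0);
}

So the GEMM build takes the !TRMMKERNEL branch and computes C = alpha*A*B + C, while the TRMM build overwrites the tile with alpha*A*B.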
vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI 
+ addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + 
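For reference, the arithmetic that each of these MRxNR macro families unrolls (16/8/4/2/1 rows of packed A against 3, 2 or 1 packed B values per k step) is the plain packed-panel micro-kernel below; a scalar C sketch, with microkernel_ref as an illustrative name and acc standing for the register block that the SAVE* macros later scale by alpha:

#include <stdio.h>

/* acc[i][j] += A[k*MR + i] * B[k*NR + j] for every k:
 * A is the packed MR-wide panel (AO), B the packed NR-wide panel (BO). */
static void microkernel_ref(long K, int MR, int NR,
                            const double *A, const double *B, double *acc)
{
    for (long k = 0; k < K; k++)
        for (int i = 0; i < MR; i++)
            for (int j = 0; j < NR; j++)
                acc[i*NR + j] += A[k*MR + i] * B[k*NR + j];
}

int main(void)
{
    /* tiny 2x2 tile with K = 2, easy to check by hand */
    double A[4] = {1, 2, 3, 4};    /* k=0: rows {1,2}; k=1: rows {3,4} */
    double B[4] = {5, 6, 7, 8};    /* k=0: cols {5,6}; k=1: cols {7,8} */
    double acc[4] = {0};
    microkernel_ref(2, 2, 2, A, B, acc);
    printf("%g %g %g %g\n", acc[0], acc[1], acc[2], acc[3]);  /* 26 30 38 44 */
    return 0;
}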
+.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , 
%xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 
+ VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + +.macro KERNEL16x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 
, %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 1, BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq 
OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + 
vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $1, %rax // K / 8 + je .L6_16 + + ALIGN_5 + +.L6_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 + +.L6_16: + movq K, %rax + + andq $1, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_36 + ALIGN_4 + +.L6_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 + ALIGN_4 + 
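+// .L6_36: handle the K % 8 remainder iterations of the 2x3 tile before the
+// result is written back by SAVE2x3.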
+.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3,%rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L7_16 + ALIGN_5 + +.L7_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_5 + +.L7_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L7_17 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + dec %rax + jne .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L7_27 + ALIGN_4 + 
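+// .L7_29: write out the accumulated 4x3 tile (SAVE4x3) and advance CO1 past it.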
+ +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + dec %rax + jne .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // 
BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax 
*2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number 
of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, 
BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + 
prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + 
+ testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + 
addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // 
aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO 
+ + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK 
+#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax 
# if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x4_haswell.S b/kernel/x86_64/dgemm_kernel_4x4_haswell.S index 0a2ca7ae3..29501df8e 100644 --- a/kernel/x86_64/dgemm_kernel_4x4_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x4_haswell.S @@ -1,3494 +1,3494 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - - -/********************************************************************* -* 2013/10/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK - -* -* -* 2013/10/27 Saar -* Parameter: -* DGEMM_DEFAULT_UNROLL_N 4 -* DGEMM_DEFAULT_UNROLL_M 4 -* DGEMM_DEFAULT_P 512 -* DGEMM_DEFAULT_Q 256 -* A_PR1 512 -* B_PR1 512 -* -* -* Performance at 9216x9216x9216: -* 1 thread: 53.3 GFLOPS (MKL: 54) -* 2 threads: 100.0 GFLOPS (MKL: 97) -* 3 threads: 147.0 GFLOPS (MKL: 133) -* 4 threads: 184.0 GFLOPS (MKL: 170) -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 -#define BO3 %rbp - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 -#define L_BUFFER_SIZE 256*8*12+4096 - -#else - -#define STACKSIZE 256 -#define L_BUFFER_SIZE 128*8*12+512 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - - -#define Ndiv12 24(%rsp) -#define Nmod12 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* Macro definitions -*******************************************************************************************/ - -.macro INIT4x12 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - vxorpd %ymm12, %ymm12, %ymm12 - vxorpd %ymm13, %ymm13, %ymm13 - vxorpd %ymm14, %ymm14, %ymm14 - vxorpd %ymm15, %ymm15, %ymm15 - -.endm - -.macro KERNEL4x12_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - prefetcht0 B_PR1(BO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1+64(BO) - vmovups -8 * SIZE(BO), %ymm2 - prefetcht0 B_PR1+128(BO) - vmovups -4 * SIZE(BO), %ymm3 - vmulpd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+192(BO) - vmulpd %ymm0 ,%ymm2 , %ymm8 - vmulpd %ymm0 ,%ymm3 , %ymm12 - prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vmulpd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 12*SIZE, BO - vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * 
SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - prefetcht0 B_PR1+128(BO) - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups 0 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 4 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups 8 * SIZE(BO), %ymm3 - addq $ 24*SIZE, BO -.endm - - -.macro KERNEL4x12_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $ 12*SIZE, BO -.endm - -.macro KERNEL4x12_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vmovups -4 * SIZE(BO), %ymm3 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 12*SIZE, BO - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - -.endm - - -.macro SAVE4x12 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm13, %ymm13 - vmulpd %ymm0 , %ymm14, %ymm14 - vmulpd %ymm0 , %ymm15, %ymm15 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd 
$ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 - - vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 - vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL2x12_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * 
SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vmovddup -4 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vmovddup -3 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - vmovddup -2 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm12 - vmovddup -1 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231pd %xmm0 ,%xmm2 , %xmm14 - addq $ 2*SIZE, AO - vfmadd231pd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE2x12 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - vmulpd %xmm0 , %xmm12, %xmm12 - vmulpd %xmm0 , %xmm13, %xmm13 - vmulpd %xmm0 , %xmm14, %xmm14 - vmulpd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm12, %xmm4 - vaddpd (%rax, LDC), %xmm13, %xmm5 - vaddpd (%rbp), %xmm14, %xmm6 - vaddpd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL1x12_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vmovsd -4 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vmovsd -3 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - vmovsd -2 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm12 - vmovsd -1 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $ 1*SIZE, AO - vfmadd231sd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE1x12 - - vmovsd ALPHA, %xmm0 - 
- vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - vmulsd %xmm0 , %xmm12, %xmm12 - vmulsd %xmm0 , %xmm13, %xmm13 - vmulsd %xmm0 , %xmm14, %xmm14 - vmulsd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm12, %xmm4 - vaddsd (%rax, LDC), %xmm13, %xmm5 - vaddsd (%rbp), %xmm14, %xmm6 - vaddsd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x4 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - -.macro KERNEL4x4_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - - addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -8 * SIZE(BO), %ymm1 - addq $ 8*SIZE, BO -.endm - - -.macro KERNEL4x4_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $ 4*SIZE, BO -.endm - -.macro KERNEL4x4_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $ 4*SIZE, BO - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd 
%ymm0 ,%ymm1 , %ymm7 - -.endm - -.macro SAVE4x4 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL2x4_SUB - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -9 * SIZE(BO), %xmm8 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x4 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL1x4_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -9 * SIZE(BO), %xmm8 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x4 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - addq $ 1*SIZE, CO1 -.endm - - 
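The SAVE*x* macros above all end the same way: the accumulators are scaled by ALPHA, the existing C tile is added back in unless TRMMKERNEL is defined (the vaddpd/vaddsd block is compiled out in that case), the result is stored, and CO1 advances by the tile width; the 4-wide variants additionally undo the vpermpd/vblendpd rotation of the A vector before this store. A minimal C sketch of the store step is given below, assuming a column-ordered accumulator tile and an element-stride ldc; the names are illustrative only.

/* Sketch of the common tail of the SAVE*x* macros: scale by alpha, then
 * accumulate into C (GEMM) or overwrite it (TRMM). */
static void save_tile(double *c, long ldc, double alpha,
                      const double *acc, int mr, int nr, int trmm)
{
    for (int j = 0; j < nr; j++)
        for (int i = 0; i < mr; i++) {
            double v = alpha * acc[j * mr + i];
            c[j * ldc + i] = trmm ? v : c[j * ldc + i] + v;
        }
}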
-/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL4x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $ 2*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 - vaddpd (CO1, LDC), %xmm6, %xmm6 - vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm7 , 2 * SIZE(CO1, LDC) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm6 , %xmm6 , %xmm6 - -.endm - - -.macro KERNEL2x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 2*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm6 , %xmm6 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm6, %xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - -.endm - - -.macro KERNEL1x2_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $ 2*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x1 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - - -.macro KERNEL4x1 - - vbroadcastsd -12 * SIZE(BO), %ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm1 - vbroadcastsd -10 * SIZE(BO), %ymm2 - vbroadcastsd -9 * SIZE(BO), %ymm3 - - vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 - - vbroadcastsd -8 * SIZE(BO), %ymm0 - vbroadcastsd -7 * SIZE(BO), %ymm1 - - vfmadd231pd -8 * SIZE(AO) 
,%ymm2 , %ymm6 - vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 - - vbroadcastsd -6 * SIZE(BO), %ymm2 - vbroadcastsd -5 * SIZE(BO), %ymm3 - - vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 - vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 - - addq $ 8 *SIZE, BO - addq $ 32*SIZE, AO - -.endm - - -.macro KERNEL4x1_SUB - vbroadcastsd -12 * SIZE(BO), %ymm2 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm2 , %ymm4 - addq $ 1*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vaddpd %ymm4,%ymm5, %ymm4 - vaddpd %ymm6,%ymm7, %ymm6 - vaddpd %ymm4,%ymm6, %ymm4 - - vmulpd %ymm0 , %ymm4 , %ymm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %ymm4, %ymm4 - -#endif - - vmovups %ymm4 , (CO1) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL2x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $ 1*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x1 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL1x1_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $ 1*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - - addq $ 1*SIZE, CO1 -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv12 // N / 12 - movq %rdx, Nmod12 // N % 12 - - - movq Ndiv12, J - cmpq $ 0, J - je .L4_0 - ALIGN_4 - -.L12_01: - // copy to sub buffer - movq K, %rax - salq $2,%rax // K * 4 ; read 2 values - 
movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 - - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $1 , %rax // K / 2 - jz .L12_01a_2 - ALIGN_4 - -.L12_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetcht0 512(BO3) - prefetchw 512(BO) - - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 4 * SIZE(BO1), %ymm5 - vmovups 0 * SIZE(BO2), %ymm2 - vmovups 4 * SIZE(BO2), %ymm6 - vmovups 0 * SIZE(BO3), %ymm3 - vmovups 4 * SIZE(BO3), %ymm7 - - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - - vmovups %ymm5, 12 * SIZE(BO) - vmovups %ymm6, 16 * SIZE(BO) - vmovups %ymm7, 20 * SIZE(BO) - - addq $ 8 * SIZE ,BO1 - addq $ 8 * SIZE ,BO2 - addq $ 8 * SIZE ,BO3 - addq $ 24 *SIZE ,BO - - decq %rax - jnz .L12_01a_1 - - - -.L12_01a_2: - - movq K, %rax - andq $1, %rax // K % 2 - jz .L12_03c - ALIGN_4 - - -.L12_02b: - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 0 * SIZE(BO2), %ymm2 - vmovups 0 * SIZE(BO3), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 4*SIZE,BO3 - addq $ 12*SIZE,BO - decq %rax - jnz .L12_02b - -.L12_03c: - - movq BO3, B // next offset of B - -.L12_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L12_20 - - ALIGN_4 - -.L12_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L12_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L12_12a - - ALIGN_5 -.L12_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L12_12 - -.L12_12a: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_13: - - test $1, %rax - jz .L12_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_14: - - INIT4x12 - - -.L12_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_19 - - ALIGN_4 - -.L12_17: - - KERNEL4x12_SUB - - dec %rax - jne .L12_17 - ALIGN_4 - - -.L12_19: - - SAVE4x12 - - decq I # i -- - jne .L12_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L12_20: - // Test rest of M - - testq $3, M - jz .L12_100 // to next 16 lines of N - - -.L12_30: - testq $2, M - jz .L12_40 - - ALIGN_4 - -.L12_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L12_36 - ALIGN_4 - -.L12_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L12_32 - ALIGN_4 - -.L12_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_39 - - ALIGN_4 - -.L12_37: - - KERNEL2x12_SUB - - dec %rax - jne .L12_37 - ALIGN_4 - - -.L12_39: - - SAVE2x12 - - ALIGN_4 - -.L12_40: - testq $1, M - jz .L12_100 // to next 3 lines of N - - ALIGN_4 - -.L12_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO 
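The .L12_01 copy loop above merges three 4-column panels of packed B, spaced K*4 elements apart (BO1, BO2, BO3), into BUFFER1 so that the twelve B values needed per k by the 4x12 micro-kernel sit contiguously. A rough C sketch of that packing follows; the panel layout is inferred from the loads and stores only, and the function name is hypothetical.

/* Sketch of the B repacking done by .L12_01a_1 / .L12_02b: per k, copy
 * four doubles from each of the three 4-wide panels into one 12-wide row. */
static void pack_b12(const double *b, long K, double *buffer)
{
    const double *bo1 = b;
    const double *bo2 = b + 4 * K;   /* leaq (B,%rax,SIZE), BO2  with rax = K*4 */
    const double *bo3 = b + 8 * K;   /* leaq (BO2,%rax,SIZE), BO3               */

    for (long k = 0; k < K; k++) {
        for (int j = 0; j < 4; j++) buffer[j]     = bo1[k * 4 + j];
        for (int j = 0; j < 4; j++) buffer[4 + j] = bo2[k * 4 + j];
        for (int j = 0; j < 4; j++) buffer[8 + j] = bo3[k * 4 + j];
        buffer += 12;
    }
}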
- - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L12_46 - - ALIGN_4 - -.L12_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L12_42 - ALIGN_4 - -.L12_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_49 - - ALIGN_4 - -.L12_47: - - KERNEL1x12_SUB - - dec %rax - jne .L12_47 - ALIGN_4 - - -.L12_49: - - SAVE1x12 - - ALIGN_4 - -.L12_100: - - decq J // j -- - jg .L12_01 - - -.L4_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - sarq $2, J // j = j / 4 - je .L2_0 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x4 - - movq K, %rax - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x4 - - movq K, %rax - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - - ALIGN_4 - -.L4_100: - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L4_10 - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 
* SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x2 - - movq K, %rax - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x2 - - movq K, %rax - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x2 - - movq K, %rax - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -.L2_100: - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x1 - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x1 - - movq K, %rax - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - movq B, BO // first buffer to BO - 
addq $12 * SIZE, BO - - INIT1x1 - - movq K, %rax - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -.L1_100: - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv12 // N / 4 - movq %rdx, Nmod12 // N % 4 - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - - - movq Ndiv12, J - cmpq $ 0, J - je .L2_0 - ALIGN_4 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, 
%rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x4 - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq 
$12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x4 - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L4_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK // number of values in B -#endif - - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L4_10 - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x2 - - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg 
.L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x2 - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x2 - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - -.L2_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK // number of values in B -#endif - - movq K, %rax - salq $1, %rax // * 
2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x1 - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x1 - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values 
in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x1 - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - - -.L1_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $1, KK // number of values in B -#endif - - - -.L999: - - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + + +/********************************************************************* +* 2013/10/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/27 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 4 +* DGEMM_DEFAULT_P 512 +* DGEMM_DEFAULT_Q 256 +* A_PR1 512 +* B_PR1 512 +* +* +* Performance at 9216x9216x9216: +* 1 thread: 53.3 GFLOPS (MKL: 54) +* 2 threads: 100.0 GFLOPS (MKL: 97) +* 3 threads: 147.0 GFLOPS (MKL: 133) +* 4 threads: 184.0 GFLOPS (MKL: 170) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 +#define L_BUFFER_SIZE 256*8*12+4096 + +#else + +#define STACKSIZE 256 +#define L_BUFFER_SIZE 128*8*12+512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + 
vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + 
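+// SAVE4x12: write back one 4x12 block of C. The accumulators ymm4-ymm15 are
+// scaled by alpha, then the vpermpd/vblendpd sequences undo the 0xb1/0x1b
+// permutations applied to ymm0 inside the KERNEL4x12 macros and reassemble
+// the results into plain column vectors, which are (for the non-TRMM build)
+// added to C and stored four columns at a time via LDC; CO1 advances by
+// four elements at the end.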
vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $ 0xb1 , %ymm9 , %ymm9 + vpermpd $ 0xb1 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + vpermpd $ 0xb1 , %ymm13, %ymm13 + vpermpd $ 0xb1 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , 
%xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + 
vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 
+ vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + 
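+// SAVE1x4: scale the four scalar accumulators by alpha, add the existing C
+// entries unless building the TRMM kernel, and store one element into each
+// of the four columns CO1, CO1+LDC, CO1+2*LDC and CO1+3*LDC.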
vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , %xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , 
%ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + 
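+        // STACK_TOUCH (defined above) expands to one dword store per 4K page
+        // of the reserved local buffer on Windows, so the stack pages are
+        // committed in order before BUFFER1 is used; on other platforms it
+        // expands to nothing.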
STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv12 // N / 12 + movq %rdx, Nmod12 // N % 12 + + + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $2,%rax // K * 4 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $1 , %rax // K / 2 + jz .L12_01a_2 + ALIGN_4 + +.L12_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetcht0 512(BO3) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm5 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 4 * SIZE(BO2), %ymm6 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups 4 * SIZE(BO3), %ymm7 + + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + + vmovups %ymm5, 12 * SIZE(BO) + vmovups %ymm6, 16 * SIZE(BO) + vmovups %ymm7, 20 * SIZE(BO) + + addq $ 8 * SIZE ,BO1 + addq $ 8 * SIZE ,BO2 + addq $ 8 * SIZE ,BO3 + addq $ 24 *SIZE ,BO + + decq %rax + jnz .L12_01a_1 + + + +.L12_01a_2: + + movq K, %rax + andq $1, %rax // K % 2 + jz .L12_03c + ALIGN_4 + + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 4*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + movq BO3, B // next offset of B + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + 
KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + decq J // j -- + jg .L12_01 + + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $2, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + 
ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + 
+ movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv12 // N / 4 + movq %rdx, Nmod12 // N % 4 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + + + movq Ndiv12, J + cmpq $ 0, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + 
subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + 
ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + 
KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 19e32ef2c..adaa28bbc 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1,5153 +1,5153 @@ -/********************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 -#define BO3 %rbp - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 -#define L_BUFFER_SIZE 256*8*12+4096 - -#else - -#define STACKSIZE 256 -#define L_BUFFER_SIZE 128*8*12+512 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - - -#define Ndiv12 24(%rsp) -#define Nmod12 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 160 -#define BROADCASTKERNEL - -/******************************************************************************************* -* Macro definitions -*******************************************************************************************/ - -.macro INIT4x12 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - vxorpd %ymm12, %ymm12, %ymm12 - vxorpd %ymm13, %ymm13, %ymm13 - vxorpd %ymm14, %ymm14, %ymm14 - vxorpd %ymm15, %ymm15, %ymm15 
- -.endm - -.macro KERNEL4x12_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - prefetcht0 B_PR1(BO) -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - prefetcht0 B_PR1+64(BO) - vmovups -8 * SIZE(BO), %ymm2 - prefetcht0 B_PR1+128(BO) - vmovups -4 * SIZE(BO), %ymm3 - vmulpd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+192(BO) - vmulpd %ymm0 ,%ymm2 , %ymm8 - vmulpd %ymm0 ,%ymm3 , %ymm12 - prefetcht0 B_PR1+256(BO) -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vmulpd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 12*SIZE, BO - vmulpd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M1 - prefetcht0 A_PR1(AO) -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - prefetcht0 B_PR1+128(BO) - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M2 -# if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -# else - vmovups -12 * SIZE(AO), %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups 0 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 4 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups 8 * SIZE(BO), %ymm3 - addq $ 24*SIZE, BO -.endm - - -.macro KERNEL4x12_E -# if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -# else - vmovups -12 * SIZE(AO), %ymm0 -# 
endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $ 12*SIZE, BO -.endm - -.macro KERNEL4x12_SUB - vmovups -12 * SIZE(BO), %ymm1 -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vmovups -4 * SIZE(BO), %ymm3 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 12*SIZE, BO - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - -.endm - - -.macro SAVE4x12 - - prefetcht0 BUFFER1 - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - prefetcht0 64 + BUFFER1 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 -#if B_PR1 > 32 - prefetcht0 128 + BUFFER1 -#endif - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm13, %ymm13 - vmulpd %ymm0 , %ymm14, %ymm14 - vmulpd %ymm0 , %ymm15, %ymm15 -#if B_PR1 > 96 - prefetcht0 192 + BUFFER1 -#endif - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 -#else - vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 -#endif - -#if B_PR1 > 160 - prefetcht0 256 + BUFFER1 -#endif - -#if defined BROADCASTKERNEL - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 -#endif - -#if B_PR1 > 224 - prefetcht0 320 + BUFFER1 -#endif - -#ifndef BROADCASTKERNEL - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 -#endif - -#if B_PR1 > 288 - prefetcht0 384 + BUFFER1 -#endif - -#ifndef BROADCASTKERNEL - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 
, %ymm7 -#endif - -#if B_PR1 > 352 - prefetcht0 448 + BUFFER1 -#endif - leaq (CO1, LDC, 2), %rax - -#if B_PR1 > 416 - prefetcht0 512 + BUFFER1 -#endif - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht1 56(CO1) - prefetcht1 56(CO1,LDC) - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 - vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 - vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 - vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm9, %ymm9 - vpermilpd $ 0x05 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 - vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - prefetcht1 56(%rbp) - prefetcht1 56(%rbp,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 - vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 - vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 - vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm13, %ymm13 - vpermilpd $ 0x05 , %ymm15, %ymm15 - - vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 - vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - prefetcht1 56(%rbp) - prefetcht1 56(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, 
%xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL2x12_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vmovddup -4 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vmovddup -3 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - vmovddup -2 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm12 - vmovddup -1 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231pd %xmm0 ,%xmm2 , %xmm14 - addq $ 2*SIZE, AO - vfmadd231pd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE2x12 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - vmulpd %xmm0 , %xmm12, %xmm12 - vmulpd %xmm0 , %xmm13, %xmm13 - vmulpd %xmm0 , %xmm14, %xmm14 - vmulpd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm12, %xmm4 - vaddpd (%rax, LDC), %xmm13, %xmm5 - vaddpd (%rbp), %xmm14, %xmm6 - vaddpd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL1x12_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vmovsd -4 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vmovsd -3 * SIZE(BO), %xmm1 - 
vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - vmovsd -2 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm12 - vmovsd -1 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $ 1*SIZE, AO - vfmadd231sd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE1x12 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - vmulsd %xmm0 , %xmm12, %xmm12 - vmulsd %xmm0 , %xmm13, %xmm13 - vmulsd %xmm0 , %xmm14, %xmm14 - vmulsd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm12, %xmm4 - vaddsd (%rax, LDC), %xmm13, %xmm5 - vaddsd (%rbp), %xmm14, %xmm6 - vaddsd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - -/******************************************************************************************/ - - -.macro INIT4x8 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - -.endm - -.macro KERNEL4x8_I - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vmulpd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M1 - prefetcht0 A_PR1(AO) -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, 
%ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M2 -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -4 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 0 * SIZE(BO), %ymm2 - addq $ 16*SIZE, BO -.endm - - -.macro KERNEL4x8_E -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - addq $ 8*SIZE, BO -.endm - -.macro KERNEL4x8_SUB - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 8*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - -.endm - - -.macro SAVE4x8 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - 
vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 56(CO1) - prefetcht0 56(CO1,LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 - vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 - vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 - vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm9 , %ymm9 - vpermilpd $ 0x05 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL2x8_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - addq $ 8*SIZE, BO - addq $ 2*SIZE, AO - -.endm - -.macro SAVE2x8 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if 
!defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL1x8_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - addq $ 8*SIZE, BO - addq $ 1*SIZE, AO - -.endm - -.macro SAVE1x8 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - - -/******************************************************************************************/ - -.macro INIT4x4 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - -.macro KERNEL4x4_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - - addq $ 4*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , 
%ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M1 - prefetcht0 A_PR1(AO) -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M2 -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -8 * SIZE(BO), %ymm1 - addq $ 8*SIZE, BO -.endm - - -.macro KERNEL4x4_E -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $ 4*SIZE, BO -.endm - -.macro KERNEL4x4_SUB - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $ 4*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $ 4*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - -.endm - -.macro SAVE4x4 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vperm2f128 $ 
0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL2x4_SUB - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -9 * SIZE(BO), %xmm8 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x4 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL1x4_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -9 * SIZE(BO), %xmm8 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x4 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL4x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , 
%xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $ 2*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 - vaddpd (CO1, LDC), %xmm6, %xmm6 - vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm7 , 2 * SIZE(CO1, LDC) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm6 , %xmm6 , %xmm6 - -.endm - - -.macro KERNEL2x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 2*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm6 , %xmm6 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm6, %xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - -.endm - - -.macro KERNEL1x2_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $ 2*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x1 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - - -.macro KERNEL4x1 - - vbroadcastsd -12 * SIZE(BO), %ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm1 - vbroadcastsd -10 * SIZE(BO), %ymm2 - vbroadcastsd -9 * SIZE(BO), %ymm3 - - vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 - - vbroadcastsd -8 * SIZE(BO), %ymm0 - vbroadcastsd -7 * SIZE(BO), %ymm1 - - vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 - - vbroadcastsd -6 * SIZE(BO), %ymm2 - vbroadcastsd -5 * SIZE(BO), %ymm3 - - vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 - vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 - - addq $ 8 *SIZE, BO - addq $ 32*SIZE, AO - -.endm - - -.macro KERNEL4x1_SUB - vbroadcastsd -12 * SIZE(BO), %ymm2 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm2 , %ymm4 - addq $ 1*SIZE, BO - addq $ 4*SIZE, AO - 
-.endm - - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vaddpd %ymm4,%ymm5, %ymm4 - vaddpd %ymm6,%ymm7, %ymm6 - vaddpd %ymm4,%ymm6, %ymm4 - - vmulpd %ymm0 , %ymm4 , %ymm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %ymm4, %ymm4 - -#endif - - vmovups %ymm4 , (CO1) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL2x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $ 1*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x1 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL1x1_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $ 1*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - - addq $ 1*SIZE, CO1 -.endm - - -.macro PREFETCHT0_C - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) -.endm -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $24, %rdi - divq %rdi // N / 24 - movq %rax, Ndiv12 // N / 24 - movq %rdx, Nmod12 // N % 24 - - - movq Ndiv12, J - cmpq $ 0, J - je .L8_0 - ALIGN_4 - -.L12_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values from BO1 - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - movq BO2 , B - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - -.L12_02b: - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 4 * SIZE(BO1), %ymm2 - vmovups 0 * SIZE(BO2), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - 
addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L12_02b - -.L12_03c: - - -.L12_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L12_20 - - ALIGN_4 - -.L12_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L12_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L12_12a - - ALIGN_5 -.L12_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L12_12 - -.L12_12a: - prefetcht0 ALPHA - PREFETCHT0_C - addq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - leaq (CO1,LDC,2),CO1 - KERNEL4x12_M2 - PREFETCHT0_C - subq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - subq LDC,CO1 - subq LDC,CO1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_13: - - test $1, %rax - jz .L12_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_14: - - INIT4x12 - - -.L12_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_19 - - ALIGN_4 - -.L12_17: - - KERNEL4x12_SUB - - dec %rax - jne .L12_17 - ALIGN_4 - - -.L12_19: - - SAVE4x12 - - /* here for the prefetch of next b source block */ - /* the increment should be proportional to GEMM_Q/GEMM_P */ - - salq $3, K -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - prefetcht2 32(B) - prefetcht2 32(B, K, 8) - addq $64, B /* increment */ -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - prefetcht2 32(B) - prefetcht2 32(B, K, 8) - prefetcht2 96(B) - prefetcht2 96(B, K, 8) - addq $128, B /* increment */ -#endif - sarq $3, K - - decq I # i -- - jne .L12_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ - - /* recover the original value of pointer B after prefetch */ - movq M, I - sarq $2, I -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - salq $6, I -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - salq $7, I -#endif - subq I, B - -.L12_20: - // Test rest of M - - testq $3, M - jz .L12_100 // to next 16 lines of N - - -.L12_30: - testq $2, M - jz .L12_40 - - ALIGN_4 - -.L12_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L12_36 - ALIGN_4 - -.L12_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L12_32 - ALIGN_4 - -.L12_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_39 - - ALIGN_4 - -.L12_37: - - KERNEL2x12_SUB - - dec %rax - jne .L12_37 - ALIGN_4 - - -.L12_39: - - SAVE2x12 - - ALIGN_4 - -.L12_40: - testq $1, M - jz .L12_100 // to next 3 lines of N - - ALIGN_4 - -.L12_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L12_46 - - ALIGN_4 - -.L12_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L12_42 - ALIGN_4 - -.L12_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je 
.L12_49 - - ALIGN_4 - -.L12_47: - - KERNEL1x12_SUB - - dec %rax - jne .L12_47 - ALIGN_4 - - -.L12_49: - - SAVE1x12 - - ALIGN_4 - -.L12_100: - - - -/**************************************************************************************************/ - -.L13_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values - movq B, BO2 - leaq (B,%rax, SIZE), BO3 // next offset to BO2 - leaq (BO3,%rax, SIZE), B // next offset to B - - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - - -.L13_02b: - - vmovups 4 * SIZE(BO2), %ymm1 - vmovups 0 * SIZE(BO3), %ymm2 - vmovups 4 * SIZE(BO3), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 8*SIZE,BO2 - addq $ 8*SIZE,BO3 - addq $ 12*SIZE,BO - decq %rax - jnz .L13_02b - - - -.L13_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L13_20 - - ALIGN_4 - -.L13_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L13_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L13_12a - - ALIGN_5 -.L13_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L13_12 - -.L13_12a: - prefetcht0 ALPHA - PREFETCHT0_C - addq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - leaq (CO1,LDC,2),CO1 - KERNEL4x12_M2 - PREFETCHT0_C - subq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - subq LDC,CO1 - subq LDC,CO1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - -.L13_13: - - test $1, %rax - jz .L13_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - - -.L13_14: - - INIT4x12 - - -.L13_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_19 - - ALIGN_4 - -.L13_17: - - KERNEL4x12_SUB - - dec %rax - jne .L13_17 - ALIGN_4 - - -.L13_19: - - SAVE4x12 - - /* here for the prefetch of next b source block */ - /* the increment should be proportional to GEMM_Q/GEMM_P */ - - salq $3, K -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - prefetcht2 (B) - prefetcht2 (B, K, 8) - addq $64, B /* increment */ -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - prefetcht2 (B) - prefetcht2 (B, K, 8) - prefetcht2 64(B) - prefetcht2 64(B, K, 8) - addq $128, B /* increment */ -#endif - sarq $3, K - - decq I # i -- - jne .L13_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ - /* recover the original value of pointer B */ - movq M, I - sarq $2, I -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - salq $6, I -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - salq $7, I -#endif - subq I, B - -.L13_20: - // Test rest of M - - testq $3, M - jz .L13_100 // to next 16 lines of N - - -.L13_30: - testq $2, M - jz .L13_40 - - ALIGN_4 - -.L13_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L13_36 - ALIGN_4 - -.L13_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - 
jne .L13_32 - ALIGN_4 - -.L13_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_39 - - ALIGN_4 - -.L13_37: - - KERNEL2x12_SUB - - dec %rax - jne .L13_37 - ALIGN_4 - - -.L13_39: - - SAVE2x12 - - ALIGN_4 - -.L13_40: - testq $1, M - jz .L13_100 // to next 3 lines of N - - ALIGN_4 - -.L13_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L13_46 - - ALIGN_4 - -.L13_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L13_42 - ALIGN_4 - -.L13_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_49 - - ALIGN_4 - -.L13_47: - - KERNEL1x12_SUB - - dec %rax - jne .L13_47 - ALIGN_4 - - -.L13_49: - - SAVE1x12 - - ALIGN_4 - -.L13_100: - - decq J // j -- - jg .L12_01 - - - - -/**************************************************************************************************/ - -.L8_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - sarq $3, J // j = j / 8 - je .L4_0 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L8_13 - - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - subq $2, %rax - je .L8_12a - - ALIGN_5 - -.L8_12: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - dec %rax - jne .L8_12 - -.L8_12a: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_13: - - test $1, %rax - jz .L8_14 - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_14: - - INIT4x8 - - -.L8_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_19 - - ALIGN_4 - -.L8_17: - - KERNEL4x8_SUB - - dec %rax - jne .L8_17 - ALIGN_4 - - -.L8_19: - - SAVE4x8 - - decq I # i -- - jg .L8_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L8_20: - // Test rest of M - - testq $3, M - jz .L8_100 // to next 16 lines of N - - -.L8_30: - testq $2, M - jz .L8_40 - - ALIGN_4 - -.L8_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x8 - - movq K, %rax - - sarq $3, %rax - je .L8_36 - ALIGN_4 - -.L8_32: - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - dec %rax - jne .L8_32 - ALIGN_4 - -.L8_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_39 - - ALIGN_4 - -.L8_37: - - KERNEL2x8_SUB - - dec %rax - jne .L8_37 - - -.L8_39: - - SAVE2x8 - -.L8_40: - testq $1, M - jz .L8_100 // to next 3 lines of N - - ALIGN_4 - -.L8_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x8 - - movq K, %rax - - sarq $3,%rax - je .L8_46 - - ALIGN_4 - -.L8_42: - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - dec %rax - jne .L8_42 - ALIGN_4 - -.L8_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_49 - - ALIGN_4 - -.L8_47: - - 
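For orientation, the label scheme above is a two-level blocking of the GEMM loops: N is consumed in panels of 12 columns at a time (the .L12_*/.L13_* blocks), with the N % 12 leftover handled as 8, 4, 2 and 1 columns (.L8_*, .L4_*, .L2_*, .L1_*), and inside each panel M is consumed in tiles of 4 rows, then 2, then 1, each tile with its own INIT/KERNEL/SAVE macros. The following is a minimal C sketch of just that tiling order; micro_tile and dgemm_tiled are hypothetical stand-ins operating on plain unpacked row-major arrays, not the kernel's actual packed-buffer interface.

#include <stddef.h>

/* Stand-in for the KERNEL<mr>x<nr>_SUB macros: C[i:i+mr, j:j+nr] +=
 * alpha * A[i:i+mr, :] * B[:, j:j+nr].  A is M x K, B is K x N, C is M x N,
 * all row-major and unpacked (unlike the real kernel's buffers). */
static void micro_tile(size_t mr, size_t nr, size_t i, size_t j,
                       size_t K, size_t N,
                       const double *A, const double *B, double *C,
                       double alpha)
{
    for (size_t ii = 0; ii < mr; ii++)
        for (size_t jj = 0; jj < nr; jj++) {
            double acc = 0.0;
            for (size_t k = 0; k < K; k++)      /* the unrolled K loop above */
                acc += A[(i + ii) * K + k] * B[k * N + (j + jj)];
            C[(i + ii) * N + (j + jj)] += alpha * acc;
        }
}

/* Tiling order mirrored from the labels: panels of 12 columns, then the
 * leftover as 8, 4, 2, 1 columns; rows in tiles of 4, then 2, then 1. */
void dgemm_tiled(size_t M, size_t N, size_t K, double alpha,
                 const double *A, const double *B, double *C)
{
    static const size_t ncols[] = { 12, 8, 4, 2, 1 };
    static const size_t mrows[] = { 4, 2, 1 };
    size_t j = 0;
    for (size_t jb = 0; jb < sizeof ncols / sizeof *ncols; jb++)
        while (N - j >= ncols[jb]) {
            size_t i = 0;
            for (size_t ib = 0; ib < sizeof mrows / sizeof *mrows; ib++)
                while (M - i >= mrows[ib]) {
                    micro_tile(mrows[ib], ncols[jb], i, j, K, N, A, B, C, alpha);
                    i += mrows[ib];
                }
            j += ncols[jb];
        }
}

Greedy 8/4/2/1 decomposition of the leftover dimensions is equivalent to the bit tests (testq $4 / $2 / $1) the assembly applies to Nmod12 and M.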
KERNEL1x8_SUB - - dec %rax - jne .L8_47 - ALIGN_4 - - -.L8_49: - - SAVE1x8 - - ALIGN_4 - -.L8_100: - - movq K, %rax - salq $3, %rax // * 8 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L8_10 - - - -/**************************************************************************************************/ - -.L4_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - testq $4, J // j = j / 4 - je .L2_0 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - - decq I # i -- - jg .L4_11 - - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x4 - - movq K, %rax - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x4 - - movq K, %rax - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - - ALIGN_4 - -.L4_100: - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x2 - - movq K, %rax - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - 
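The K loop inside each 4-row tile (for example .L4_11 through .L4_17 just above) is unrolled eight deep and software-pipelined: a KERNEL*_I prologue starts the first chunk, KERNEL*_M1/_M2 pairs form the steady state, a KERNEL*_E epilogue drains the pipeline, and a one-step KERNEL*_SUB loop mops up the K & 7 remainder. A rough C sketch of that trip-count bookkeeping, with hypothetical step_* stubs standing in for the macros, is:

static void init_accumulators(void) {}  /* INIT4x12 / INIT4x8 / INIT4x4 */
static void step_prologue(void)     {}  /* KERNEL*_I                    */
static void step_pipelined(void)    {}  /* KERNEL*_M1 or KERNEL*_M2     */
static void step_epilogue(void)     {}  /* KERNEL*_E                    */
static void step_single(void)       {}  /* KERNEL*_SUB                  */

void k_loop(long K)
{
    long chunks = K >> 3;                       /* sarq $3, %rax  */
    long rem    = K & 7;                        /* andq $7, %rax  */

    if (chunks >= 2) {
        step_prologue();                        /* first 8-step chunk      */
        for (int u = 0; u < 7; u++) step_pipelined();
        for (long c = 0; c < chunks - 2; c++)   /* steady-state loop .L*_12 */
            for (int u = 0; u < 8; u++) step_pipelined();
        for (int u = 0; u < 7; u++) step_pipelined();
        step_epilogue();                        /* final chunk, .L*_12a    */
    } else if (chunks == 1) {                   /* .L*_13: exactly one chunk */
        step_prologue();
        for (int u = 0; u < 6; u++) step_pipelined();
        step_epilogue();
    } else {
        init_accumulators();                    /* .L*_14: no full chunk   */
    }
    for (long k = 0; k < rem; k++)              /* .L*_17 remainder loop   */
        step_single();
}

The chunks >= 2, == 1 and == 0 cases correspond to the cmpq $2, test $1 and INIT* branches at .L*_11, .L*_13 and .L*_14.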
KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x2 - - movq K, %rax - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x2 - - movq K, %rax - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -.L2_100: - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x1 - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x1 - - movq K, %rax - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x1 - - movq K, %rax - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - 
KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -.L1_100: - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $8, %rdi - divq %rdi // N / 8 - movq %rax, Ndiv12 // N / 8 - movq %rdx, Nmod12 // N % 8 - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -/*************************************************************************************************/ -.L8_0: - movq Ndiv12, J - cmpq $ 0, J - je .L4_0 - ALIGN_4 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 8 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - 
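The #if/#else blocks just above implement the TRMM bookkeeping: depending on LEFT and TRANSA, the packed A and B pointers are either left at the start of their panels or advanced past the part that lies outside the triangle, and KKK (the inner-loop trip count for this tile) is derived from K and the running offset KK. A small C sketch of that bookkeeping for a generic mr x nr tile (4 x 8 here) follows; trmm_setup and trmm_tile are illustrative names, not OpenBLAS API.

/* Effective trip count and packed-panel offsets (in doubles) for one TRMM
 * tile, mirroring the KK/KKK preprocessor blocks above.  kk is the running
 * OFFSET/KK value, K the full panel depth, mr/nr the tile sizes. */
typedef struct { long k_eff, a_off, b_off; } trmm_tile;

trmm_tile trmm_setup(long K, long kk, long mr, long nr, int left, int transa)
{
    trmm_tile t;
    if ((left && transa) || (!left && !transa)) {
        t.a_off = 0;                 /* tile starts at the panel beginning */
        t.b_off = 0;
    } else {
        t.a_off = kk * mr;           /* leaq (AO, %rax, mr), AO */
        t.b_off = kk * nr;           /* leaq (BO, %rax, nr), BO */
    }
    if ((left && !transa) || (!left && transa))
        t.k_eff = K - kk;            /* movq K, %rax; subq KK, %rax */
    else
        t.k_eff = kk + (left ? mr : nr);  /* addq $mr / $nr, %rax */
    return t;
}

The mirrored block after each SAVE* then advances AO/BO by the remaining K - KKK values and, for LEFT, bumps KK by the tile height, as the inline comments there note.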
- sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L8_13 - - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - subq $2, %rax - je .L8_12a - - ALIGN_5 - -.L8_12: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - dec %rax - jne .L8_12 - -.L8_12a: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_13: - - test $1, %rax - jz .L8_14 - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_14: - - INIT4x8 - - -.L8_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_19 - - ALIGN_4 - -.L8_17: - - KERNEL4x8_SUB - - dec %rax - jne .L8_17 - ALIGN_4 - - -.L8_19: - - SAVE4x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L8_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L8_20: - // Test rest of M - - testq $3, M - jz .L8_100 // to next 16 lines of N - - -.L8_30: - testq $2, M - jz .L8_40 - - ALIGN_4 - -.L8_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x8 - - sarq $3, %rax - je .L8_36 - ALIGN_4 - -.L8_32: - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - dec %rax - jne .L8_32 - ALIGN_4 - -.L8_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_39 - - ALIGN_4 - -.L8_37: - - KERNEL2x8_SUB - - dec %rax - jne .L8_37 - - -.L8_39: - - SAVE2x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L8_40: - testq $1, M - jz .L8_100 // to next 3 lines of N - - ALIGN_4 - -.L8_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq 
(BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x8 - - sarq $3,%rax - je .L8_46 - - ALIGN_4 - -.L8_42: - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - dec %rax - jne .L8_42 - ALIGN_4 - -.L8_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_49 - - ALIGN_4 - -.L8_47: - - KERNEL1x8_SUB - - dec %rax - jne .L8_47 - ALIGN_4 - - -.L8_49: - - SAVE1x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L8_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $8, KK // number of values in B -#endif - - - decq J // j -- - jg .L8_10 - - - - - -/*************************************************************************************************/ -.L4_0: - movq Nmod12, J - testq $4, J - je .L2_0 - ALIGN_4 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x4 - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x4 - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, 
%rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L4_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK // number of values in B -#endif - - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x2 - - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x2 - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB 
- KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x2 - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - -.L2_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK // number of values in B -#endif - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number 
of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x1 - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x1 - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x1 - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - 
dec %rax - jne .L1_42 - -.L1_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - - -.L1_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $1, KK // number of values in B -#endif - - - -.L999: - - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/********************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 +#define L_BUFFER_SIZE 256*8*12+4096 + +#else + +#define STACKSIZE 256 +#define L_BUFFER_SIZE 128*8*12+512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 160 +#define BROADCASTKERNEL + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) +# if 
defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else + vmovups -12 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else + vmovups -12 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + 
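In the BROADCASTKERNEL flavour shown above, each k step of the 4x12 tile broadcasts one A element at a time and FMAs it against the three 4-wide B vectors held in ymm1..ymm3, so ymm4..ymm15 accumulate a plain row-by-column layout (the non-broadcast path instead permutes ymm0 between FMAs and untangles the result in SAVE4x12). A hedged AVX2-intrinsics rendering of one such k step, roughly what KERNEL4x12_SUB computes minus the pointer bumps and prefetches, is:

#include <immintrin.h>

/* One k step of the 4x12 micro-tile, broadcast flavour.  a points at the
 * 4 packed A values for this k, b at the 12 packed B values; acc[row][blk]
 * accumulates C(row, 4*blk .. 4*blk+3) of the tile.  Illustrative only. */
static inline void kernel4x12_step(const double *a, const double *b,
                                   __m256d acc[4][3])
{
    __m256d b0 = _mm256_loadu_pd(b + 0);              /* ymm1 */
    __m256d b1 = _mm256_loadu_pd(b + 4);              /* ymm2 */
    __m256d b2 = _mm256_loadu_pd(b + 8);              /* ymm3 */
    for (int row = 0; row < 4; row++) {
        __m256d ar = _mm256_broadcast_sd(a + row);    /* vbroadcastsd  */
        acc[row][0] = _mm256_fmadd_pd(ar, b0, acc[row][0]); /* vfmadd231pd */
        acc[row][1] = _mm256_fmadd_pd(ar, b1, acc[row][1]);
        acc[row][2] = _mm256_fmadd_pd(ar, b2, acc[row][2]);
    }
}

The twelve acc registers correspond to ymm4..ymm15; compile such a sketch with AVX2 and FMA enabled (e.g. -mavx2 -mfma with GCC or Clang).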
vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + prefetcht0 BUFFER1 + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + prefetcht0 64 + BUFFER1 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 +#if B_PR1 > 32 + prefetcht0 128 + BUFFER1 +#endif + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 > 96 + prefetcht0 192 + BUFFER1 +#endif + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif + +#if B_PR1 > 160 + prefetcht0 256 + BUFFER1 +#endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif + +#if B_PR1 > 224 + prefetcht0 320 + BUFFER1 +#endif + +#ifndef BROADCASTKERNEL + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + +#if B_PR1 > 288 + prefetcht0 384 + BUFFER1 +#endif + +#ifndef BROADCASTKERNEL + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + +#if B_PR1 > 352 + prefetcht0 448 + BUFFER1 +#endif + leaq (CO1, LDC, 2), %rax + +#if B_PR1 > 416 + prefetcht0 512 + BUFFER1 +#endif + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, 
%ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , 
%xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 
, (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ + + +.macro INIT4x8 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + +.endm + +.macro KERNEL4x8_I + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vmulpd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M1 + prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -4 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 0 * SIZE(BO), %ymm2 + addq $ 16*SIZE, BO +.endm + + +.macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * 
SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + addq $ 8*SIZE, BO +.endm + +.macro KERNEL4x8_SUB + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + +.endm + + +.macro SAVE4x8 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 
+ + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL2x8_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 2*SIZE, AO + +.endm + +.macro SAVE2x8 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL1x8_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + 
vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 1*SIZE, AO + +.endm + +.macro SAVE1x8 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + + +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd 
$ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + 
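+/* KERNEL2x4_SUB performs one k-step of the 2x4 micro-tile: xmm0 carries two packed
+   doubles of A, vmovddup broadcasts four consecutive B values, and vfmadd231pd
+   accumulates the four two-element columns of C into xmm4..xmm7, i.e.
+   c[i][j] += a[i] * b[j] for i < 2, j < 4.  AO advances by 2*SIZE and BO by 4*SIZE
+   per call; SAVE2x4 below scales by ALPHA and, in the non-TRMM path, adds into C. */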
+.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , 
%xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 
, %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) +.endm +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $24, %rdi + divq %rdi // N / 24 + movq %rax, Ndiv12 // N / 24 + movq %rdx, Nmod12 // N % 24 + + + movq Ndiv12, J + cmpq $ 0, J + je .L8_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values from BO1 + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + movq BO2 , B + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm2 + vmovups 0 * SIZE(BO2), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 + KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp 
.L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ + + /* recover the original value of pointer B after prefetch */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, I +#endif + subq I, B + +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + + +/**************************************************************************************************/ + +.L13_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values + movq B, BO2 + leaq (B,%rax, SIZE), BO3 // next offset to BO2 + leaq (BO3,%rax, SIZE), B // next offset to B + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + + +.L13_02b: + + vmovups 4 * SIZE(BO2), %ymm1 + vmovups 0 * SIZE(BO3), %ymm2 + vmovups 4 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO2 + addq $ 8*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L13_02b + + + +.L13_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L13_20 + + ALIGN_4 + +.L13_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L13_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + 
+ subq $2, %rax + je .L13_12a + + ALIGN_5 +.L13_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L13_12 + +.L13_12a: + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 + KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + +.L13_13: + + test $1, %rax + jz .L13_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_14: + + INIT4x12 + + +.L13_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_19 + + ALIGN_4 + +.L13_17: + + KERNEL4x12_SUB + + dec %rax + jne .L13_17 + ALIGN_4 + + +.L13_19: + + SAVE4x12 + + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + + decq I # i -- + jne .L13_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ + /* recover the original value of pointer B */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, I +#endif + subq I, B + +.L13_20: + // Test rest of M + + testq $3, M + jz .L13_100 // to next 16 lines of N + + +.L13_30: + testq $2, M + jz .L13_40 + + ALIGN_4 + +.L13_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L13_36 + ALIGN_4 + +.L13_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L13_32 + ALIGN_4 + +.L13_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_39 + + ALIGN_4 + +.L13_37: + + KERNEL2x12_SUB + + dec %rax + jne .L13_37 + ALIGN_4 + + +.L13_39: + + SAVE2x12 + + ALIGN_4 + +.L13_40: + testq $1, M + jz .L13_100 // to next 3 lines of N + + ALIGN_4 + +.L13_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L13_46 + + ALIGN_4 + +.L13_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L13_42 + ALIGN_4 + +.L13_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_49 + + ALIGN_4 + +.L13_47: + + KERNEL1x12_SUB + + dec %rax + jne .L13_47 + ALIGN_4 + + +.L13_49: + + SAVE1x12 + + ALIGN_4 + +.L13_100: + + decq J // j -- + jg .L12_01 + + + + +/**************************************************************************************************/ + +.L8_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $3, J // j = j / 8 + je .L4_0 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + movq B, BO + addq $12 * 
SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x8 + + movq K, %rax + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x8 + + movq K, %rax + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + + ALIGN_4 + +.L8_100: + + movq K, %rax + salq $3, %rax // * 8 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L8_10 + + + +/**************************************************************************************************/ + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + testq $4, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + 
KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq 
$3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups 
%xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $8, %rdi + divq %rdi // N / 8 + movq %rax, Ndiv12 // N / 8 + movq %rdx, Nmod12 // N % 8 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +/*************************************************************************************************/ +.L8_0: + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 8 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + 
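+/* TRMM pointer bookkeeping: the #if blocks above advance BO and AO past the
+   (K - KKK) k-iterations this triangular tile does not consume (8 B values and
+   4 A values per k), and, when the offset applies to the LEFT operand, move KK
+   forward by the 4 rows of A covered by this tile before the next block of M. */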
+ decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x8 + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x8 + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L8_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK // number 
of values in B +#endif + + + decq J // j -- + jg .L8_10 + + + + + +/*************************************************************************************************/ +.L4_0: + movq Nmod12, J + testq $4, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, 
%rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq 
%rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + 
salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git 
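The .Lx_12/.Lx_16 label pairs in the kernel above split the inner K loop in the usual way: sarq $3 yields K/8 passes of eight back-to-back KERNEL*_SUB updates, and andq $7 leaves a tail of K%8 single updates. A minimal self-contained C sketch of that split, where kernel_sub() is only a placeholder for one KERNEL*_SUB macro expansion:

/* Sketch of the K/8 unrolled loop plus K%8 remainder loop used above.
   kernel_sub is a stand-in, not a function from the kernel source. */
static void unrolled_k_loop(long kc, void (*kernel_sub)(void))
{
    long main_iters = kc >> 3;     /* sarq $3, %rax : kc / 8 */
    long tail_iters = kc & 7;      /* andq $7, %rax : kc % 8 */
    for (long i = 0; i < main_iters; i++) {
        kernel_sub(); kernel_sub(); kernel_sub(); kernel_sub();
        kernel_sub(); kernel_sub(); kernel_sub(); kernel_sub();
    }
    while (tail_iters--)
        kernel_sub();
}

The same split is repeated for every tile size handled above (4x2, 2x2, 1x2, 4x1, 2x1, 1x1).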
a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c index 90a4c2b1d..a5daffb94 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c @@ -1,670 +1,670 @@ -#include "common.h" -#include -#include - -//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. - -/* row-major c_block */ -#define INNER_KERNEL_k1m1n8 \ - "prefetcht0 384(%1);"\ - "vmovupd (%1),%%zmm5; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" - -#define INNER_KERNEL_k1m2n8 \ - INNER_KERNEL_k1m1n8\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" - -#define INNER_KERNEL_k1m1n16 \ - "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ - "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" - -#define INNER_KERNEL_k1m2n16 \ - INNER_KERNEL_k1m1n16\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" - -#define INNER_KERNEL_k1m1n24 \ - "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ - "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" - -#define INNER_KERNEL_k1m2n24 \ - INNER_KERNEL_k1m1n24\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" - -/* row-major z-partition c_block */ -#define INNER_KERNEL_k1m4n8 \ - "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ - "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ - "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" - -#define INNER_KERNEL_k1m4n16 \ - INNER_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ - "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" - -#define INNER_KERNEL_k1m4n24 \ - INNER_KERNEL_k1m4n16\ - "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ - "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" - -#define INNER_KERNEL_k1m8n8 \ - "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ - "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ - "prefetcht0 128(%1);"\ - "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ - "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" - -#define INNER_KERNEL_k1m8n16 \ - INNER_KERNEL_k1m8n8\ - "prefetcht0 128(%1,%%r12,2);"\ - "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ - "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" - -#define 
INNER_KERNEL_k1m8n24 \ - INNER_KERNEL_k1m8n16\ - "prefetcht0 128(%1,%%r12,4);"\ - "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ - "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" - -/* micro kernels */ -#define INNER_KERNELm1(nn) \ - "cmpq $1,%2;jb "#nn"3f;"\ - #nn"4:\n\t"\ - INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ - "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ - #nn"3:\n\t" - -#define INNER_KERNELm2(nn) \ - "cmpq $1,%2;jb "#nn"0f;"\ - #nn"1:\n\t"\ - INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ - "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ - #nn"0:\n\t" - -#define INNER_KERNELm4(nn) \ - "cmpq $1,%2;jb "#nn"00f;"\ - #nn"01:\n\t"\ - INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ - "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ - #nn"00:\n\t" - -/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ -#define INNER_KERNELm8(nn) \ - "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ - #nn"008:\n\t"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "prefetcht1 (%11); addq $32,%11;"\ - "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ - "movq %3,%10;"\ - #nn"001:\n\t"\ - "cmpq $1,%2;jb "#nn"000f;"\ - "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "decq %2;jmp "#nn"001b;"\ - ""#nn"000:\n\t" - -#define INNER_INIT_m1n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8;" - -#define INNER_INIT_m2n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" - -#define INNER_INIT_m4n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" - -#define INNER_INIT_m8n8 \ - INNER_INIT_m4n8\ - "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" - -#define INNER_INIT_m1n16 INNER_INIT_m2n8 - -#define INNER_INIT_m2n16 INNER_INIT_m4n8 - -#define INNER_INIT_m4n16 INNER_INIT_m8n8 - -#define INNER_INIT_m8n16 \ - INNER_INIT_m8n8\ - "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ - "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" - -#define INNER_INIT_m1n24 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" - -#define INNER_INIT_m2n24 \ - INNER_INIT_m1n24\ - "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" - -#define INNER_INIT_m4n24 \ - INNER_INIT_m4n16\ - "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" - -#define INNER_INIT_m8n24 \ - INNER_INIT_m8n16\ - "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ - "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" - -#define INNER_SETINDEX \ - "vpinsrq 
$0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ - "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" - -#define INNER_STORE_m1n8(c1,disp) \ - "kxnorw %%k1,%%k1,%%k1;"\ - "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\ - "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ - "kxnorw %%k1,%%k1,%%k1;"\ - "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};" - -#define INNER_SAVE_m1n8 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0) - -#define INNER_SAVE_m1n16 \ - INNER_SAVE_m1n8\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0) - -#define INNER_SAVE_m1n24 \ - INNER_SAVE_m1n16\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm10,0) - -#define INNER_SAVE_m2n8 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm9,8) - -#define INNER_SAVE_m2n16 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm10,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0)\ - INNER_STORE_m1n8(%%zmm11,8) - -#define INNER_SAVE_m2n24 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm11,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0)\ - INNER_STORE_m1n8(%%zmm12,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm10,0)\ - INNER_STORE_m1n8(%%zmm13,8) - -#define INNER_TRANS_4x8(c1,c2,c3,c4) \ - "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ - "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ - "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ - "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ - -#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ - "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ - "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ - "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ - "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" - -#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) - -//%7 for k01(input) only when m=4 -#define INNER_STORE_4x8(c1,c2,c3,c4) \ - "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ - "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ - "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ - "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ - "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "leaq (%10,%4,4),%10;" - -#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - 
"vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;" - -#define INNER_SAVE_m4n8 \ - "movq %3,%10;"\ - INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ - INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) - -#define INNER_SAVE_m4n16 \ - INNER_SAVE_m4n8\ - INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ - INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) - -#define INNER_SAVE_m4n24 \ - INNER_SAVE_m4n16\ - INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ - INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) - -#define INNER_SAVE_m8n8 \ - "movq %3,%10;"\ - INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ - INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) - -#define INNER_SAVE_m8n16 \ - INNER_SAVE_m8n8\ - INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ - INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) - -#define INNER_SAVE_m8n24 \ - INNER_SAVE_m8n16\ - INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ - INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) - -#define COMPUTE_n8 {\ - b_pref = packed_b_pointer + 8 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 42222f;"\ - "42221:\n\t"\ - INNER_INIT_m8n8\ - INNER_KERNELm8(8)\ - INNER_SAVE_m8n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ - "42222:\n\t"\ - "cmpq $4,%8; jb 42223f;"\ - INNER_INIT_m4n8\ - INNER_KERNELm4(8)\ - INNER_SAVE_m4n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "42223:\n\t"\ - "cmpq $2,%8; jb 42224f;"\ - INNER_INIT_m2n8\ - INNER_KERNELm2(8)\ - INNER_SAVE_m2n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "42224:\n\t"\ - "cmpq $1,%8; jb 42225f;"\ - INNER_INIT_m1n8\ - INNER_KERNELm1(8)\ - INNER_SAVE_m1n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "42225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -#define COMPUTE_n16 {\ - b_pref = packed_b_pointer + 16 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 32222f;"\ - "32221:\n\t"\ - INNER_INIT_m8n16\ - INNER_KERNELm8(16)\ - INNER_SAVE_m8n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ - "32222:\n\t"\ - "cmpq $4,%8; jb 32223f;"\ - 
INNER_INIT_m4n16\ - INNER_KERNELm4(16)\ - INNER_SAVE_m4n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "32223:\n\t"\ - "cmpq $2,%8; jb 32224f;"\ - INNER_INIT_m2n16\ - INNER_KERNELm2(16)\ - INNER_SAVE_m2n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "32224:\n\t"\ - "cmpq $1,%8; jb 32225f;"\ - INNER_INIT_m1n16\ - INNER_KERNELm1(16)\ - INNER_SAVE_m1n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "32225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ - "leaq (%1,%%r12,4),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ - "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -#define COMPUTE_n24 {\ - b_pref = packed_b_pointer + 24 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 22222f;"\ - "22221:\n\t"\ - INNER_INIT_m8n24\ - INNER_KERNELm8(24)\ - INNER_SAVE_m8n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ - "22222:\n\t"\ - "cmpq $4,%8; jb 22223f;"\ - INNER_INIT_m4n24\ - INNER_KERNELm4(24)\ - INNER_SAVE_m4n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "22223:\n\t"\ - "cmpq $2,%8; jb 22224f;"\ - INNER_INIT_m2n24\ - INNER_KERNELm2(24)\ - INNER_SAVE_m2n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "22224:\n\t"\ - "cmpq $1,%8; jb 22225f;"\ - INNER_INIT_m1n24\ - INNER_KERNELm1(24)\ - INNER_SAVE_m1n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "22225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ - "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ - "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ - "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 -//perform C += A B - if(k==0 || m==0 || ndiv8==0) return; - int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); - int64_t K = (int64_t)k; int64_t M = (int64_t)m; - double *a_block_pointer,*b_pref; - double *c_pointer = c,*c_store = c; - __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; - BLASLONG ndiv8_count; - double *packed_b_pointer = packed_b; - a_block_pointer = packed_a; - for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ - COMPUTE_n24 - } - for(;ndiv8_count>1;ndiv8_count-=2){ - COMPUTE_n16 - } - if(ndiv8_count>0){ - COMPUTE_n8 - } -} - -/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ -/* __m128d accumulators: xc1-xc2; 
temporary variables: xa1,xb1-xb2 */ -/* double accumulator: sc1; temporary variables: sa1,sb1 */ -/* column-major c_block */ -#define KERNEL_m4n4k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ - yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ - b_block_pointer+=4;\ -} -#define KERNEL_m4n2k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ - b_block_pointer+=2;\ -} -#define KERNEL_m4n1k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - b_block_pointer++;\ -} -#define INIT_m4n1 yc1=_mm256_setzero_pd(); -#define INIT_m4n2 yc2=INIT_m4n1 -#define INIT_m4n4 yc4=yc3=INIT_m4n2 -#define SAVE_m4n1 {\ - yb1 = _mm256_broadcast_sd(alpha);\ - ya1 = _mm256_loadu_pd(c_pointer);\ - yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ - _mm256_storeu_pd(c_pointer,yc1);\ - c_pointer += 4;\ -} -#define SAVE_m4n2 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ - c_pointer += 4;\ -} -#define SAVE_m4n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ - c_pointer += LDC*2;\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ - c_pointer += 4-LDC*2;\ -} -#define KERNEL_m2n2k1 {\ - xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ - xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ - b_block_pointer += 2;\ -} -#define KERNEL_m2n1k1 {\ - xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ - xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - b_block_pointer ++;\ -} -#define INIT_m2n1 xc1=_mm_setzero_pd(); -#define INIT_m2n2 xc2=INIT_m2n1 -#define SAVE_m2n1 {\ - xb1 = _mm_loaddup_pd(alpha);\ - xa1 = _mm_loadu_pd(c_pointer);\ - xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ - _mm_storeu_pd(c_pointer,xc1);\ - c_pointer += 2;\ -} -#define SAVE_m2n2 {\ - xa1 = _mm_loaddup_pd(alpha);\ - xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ - xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ - _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ - c_pointer += 2;\ -} -#define KERNEL_m1n1k1 {\ - sa1 = *a_block_pointer; a_block_pointer++;\ - sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ - b_block_pointer ++;\ -} -#define INIT_m1n1 sc1=0.0; -#define SAVE_m1n1 {\ - *c_pointer += sc1 * (*alpha);\ - c_pointer++;\ -} -/* row-major c_block */ -#define KERNEL_m2n4k1 {\ - yb1 = 
_mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ - ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ - a_block_pointer += 2;\ -} -#define KERNEL_m1n4k1 {\ - yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ - ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - a_block_pointer ++;\ -} -#define KERNEL_m1n2k1 {\ - xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ - xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - a_block_pointer ++;\ -} -#define INIT_m1n2 INIT_m2n1 -#define INIT_m1n4 INIT_m4n1 -#define INIT_m2n4 INIT_m4n2 -#define SAVE_m2n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yc1 = _mm256_mul_pd(yc1,ya1);\ - yc2 = _mm256_mul_pd(yc2,ya1);\ - yb1 = _mm256_unpacklo_pd(yc1,yc2);\ - yb2 = _mm256_unpackhi_pd(yc1,yc2);\ - xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ - xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ - _mm_storeu_pd(c_pointer,xb1);\ - _mm_storeu_pd(c_pointer+LDC,xb2);\ - xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ - xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ - _mm_storeu_pd(c_pointer+2*LDC,xb1);\ - _mm_storeu_pd(c_pointer+3*LDC,xb2);\ - c_pointer += 2;\ -} -#define SAVE_m1n2 {\ - xb1 = _mm_loaddup_pd(alpha);\ - xc1 = _mm_mul_pd(xc1,xb1);\ - *c_pointer += _mm_cvtsd_f64(xc1);\ - xa1 = _mm_unpackhi_pd(xc1,xc1);\ - c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ - c_pointer ++;\ -} -#define SAVE_m1n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yc1 = _mm256_mul_pd(yc1,ya1);\ - xb1 = _mm256_extractf128_pd(yc1,0);\ - *c_pointer += _mm_cvtsd_f64(xb1);\ - xb2 = _mm_unpackhi_pd(xb1,xb1);\ - c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ - xb1 = _mm256_extractf128_pd(yc1,1);\ - c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ - xb2 = _mm_unpackhi_pd(xb1,xb1);\ - c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ - c_pointer ++;\ -} -static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 -//perform C += A B , edge_n<8 must be satisfied. 
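The loop bodies of KERNEL_EDGE below are truncated in this rendering (for example "for(k_count=0;k_count1;m_count-=2)" has lost the text between '<' and '>'). Judging from the INIT_*/KERNEL_*/SAVE_* macros defined above, each block presumably follows the standard accumulate-then-save pattern; a hedged sketch of the 4x4 case, not a verbatim reconstruction:

/* Presumable shape of one edge block; the exact statements lost in this
   rendering may differ. Uses the macros defined in this file. */
for (m_count = m; m_count > 3; m_count -= 4) {
    b_block_pointer = b_base_pointer;          /* rewind packed B for this row block */
    INIT_m4n4                                  /* zero the 4x4 accumulators */
    for (k_count = 0; k_count < k; k_count++) KERNEL_m4n4k1   /* rank-1 updates */
    SAVE_m4n4                                  /* C += alpha * accumulators */
}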
- if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return; - double *a_block_pointer,*b_block_pointer,*b_base_pointer; - double *c_pointer = c; - __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; - __m128d xc1,xc2,xa1,xb1,xb2; - double sc1,sa1,sb1; - BLASLONG m_count,n_count,k_count; - b_base_pointer = packed_b; -//now start calculation of the edge part - for(n_count=edge_n;n_count>3;n_count-=4){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n4 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n4 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n4 - for(k_count=0;k_count1;n_count-=2){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n2 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n2 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n2 - for(k_count=0;k_count0){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n1 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n1 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n1 - for(k_count=0;k_count0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); - if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); - return 0; -} +#include "common.h" +#include +#include + +//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. + +/* row-major c_block */ +#define INNER_KERNEL_k1m1n8 \ + "prefetcht0 384(%1);"\ + "vmovupd (%1),%%zmm5; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" + +#define INNER_KERNEL_k1m2n8 \ + INNER_KERNEL_k1m1n8\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m1n16 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m2n16 \ + INNER_KERNEL_k1m1n16\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m1n24 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" + +#define INNER_KERNEL_k1m2n24 \ + INNER_KERNEL_k1m1n24\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" + +/* row-major z-partition c_block */ +#define INNER_KERNEL_k1m4n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ + "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ + "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ + "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" + +#define INNER_KERNEL_k1m4n24 \ + 
INNER_KERNEL_k1m4n16\ + "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ + "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" + +#define INNER_KERNEL_k1m8n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ + "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ + "prefetcht0 128(%1);"\ + "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ + "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m8n8\ + "prefetcht0 128(%1,%%r12,2);"\ + "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ + "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" + +#define INNER_KERNEL_k1m8n24 \ + INNER_KERNEL_k1m8n16\ + "prefetcht0 128(%1,%%r12,4);"\ + "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ + "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" + +/* micro kernels */ +#define INNER_KERNELm1(nn) \ + "cmpq $1,%2;jb "#nn"3f;"\ + #nn"4:\n\t"\ + INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ + #nn"3:\n\t" + +#define INNER_KERNELm2(nn) \ + "cmpq $1,%2;jb "#nn"0f;"\ + #nn"1:\n\t"\ + INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ + #nn"0:\n\t" + +#define INNER_KERNELm4(nn) \ + "cmpq $1,%2;jb "#nn"00f;"\ + #nn"01:\n\t"\ + INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ + "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ + #nn"00:\n\t" + +/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ +#define INNER_KERNELm8(nn) \ + "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ + #nn"008:\n\t"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%11); addq $32,%11;"\ + "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ + "movq %3,%10;"\ + #nn"001:\n\t"\ + "cmpq $1,%2;jb "#nn"000f;"\ + "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "decq %2;jmp "#nn"001b;"\ + ""#nn"000:\n\t" + +#define INNER_INIT_m1n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8;" + +#define INNER_INIT_m2n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" + +#define INNER_INIT_m4n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" + +#define INNER_INIT_m8n8 \ + INNER_INIT_m4n8\ + "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" 
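The INNER_INIT_* strings above clear the zmm accumulators with vpxorq, and the INNER_KERNEL_* strings then add one rank-1 update per k step with vfmadd231pd. A self-contained C-intrinsics sketch of the same pattern for a single 1x8 strip (AVX-512F assumed; the function and variable names are illustrative, not taken from the kernel):

#include <immintrin.h>

/* Sketch: c[0..7] += alpha * sum_k a[k] * b[8*k .. 8*k+7], the 1x8 special
   case of what the zmm accumulators above compute for whole tiles. */
static void strip_1x8(const double *a, const double *b, double *c,
                      double alpha, long k)
{
    __m512d acc = _mm512_setzero_pd();            /* vpxorq %%zmm8,%%zmm8,%%zmm8 */
    for (long i = 0; i < k; i++) {
        __m512d bv = _mm512_loadu_pd(b + 8 * i);  /* vmovupd (%1),%%zmm5         */
        __m512d av = _mm512_set1_pd(a[i]);        /* vbroadcastsd (%0),%%zmm4    */
        acc = _mm512_fmadd_pd(av, bv, acc);       /* vfmadd231pd                 */
    }
    __m512d cv = _mm512_loadu_pd(c);
    _mm512_storeu_pd(c, _mm512_fmadd_pd(_mm512_set1_pd(alpha), acc, cv));
}

Keeping many such accumulators live at once (zmm8-zmm31, per the register-usage comment at the top of the file) is what lets the larger tiles reuse every loaded A and B element several times before C is touched.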
+ +#define INNER_INIT_m1n16 INNER_INIT_m2n8 + +#define INNER_INIT_m2n16 INNER_INIT_m4n8 + +#define INNER_INIT_m4n16 INNER_INIT_m8n8 + +#define INNER_INIT_m8n16 \ + INNER_INIT_m8n8\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ + "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" + +#define INNER_INIT_m1n24 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" + +#define INNER_INIT_m2n24 \ + INNER_INIT_m1n24\ + "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" + +#define INNER_INIT_m4n24 \ + INNER_INIT_m4n16\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" + +#define INNER_INIT_m8n24 \ + INNER_INIT_m8n16\ + "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ + "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" + +#define INNER_SETINDEX \ + "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ + "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" + +#define INNER_STORE_m1n8(c1,disp) \ + "kxnorw %%k1,%%k1,%%k1;"\ + "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\ + "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ + "kxnorw %%k1,%%k1,%%k1;"\ + "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};" + +#define INNER_SAVE_m1n8 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0) + +#define INNER_SAVE_m1n16 \ + INNER_SAVE_m1n8\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0) + +#define INNER_SAVE_m1n24 \ + INNER_SAVE_m1n16\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm10,0) + +#define INNER_SAVE_m2n8 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm9,8) + +#define INNER_SAVE_m2n16 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm10,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm11,8) + +#define INNER_SAVE_m2n24 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm11,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm12,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm10,0)\ + INNER_STORE_m1n8(%%zmm13,8) + +#define INNER_TRANS_4x8(c1,c2,c3,c4) \ + "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ + "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ + "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ + +#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ + "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ + "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ + "vblendmpd 
%%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ + "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" + +#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) + +//%7 for k01(input) only when m=4 +#define INNER_STORE_4x8(c1,c2,c3,c4) \ + "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ + "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ + "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ + "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ + "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "leaq (%10,%4,4),%10;" + +#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + "vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;" + +#define INNER_SAVE_m4n8 \ + "movq %3,%10;"\ + INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ + INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) + +#define INNER_SAVE_m4n16 \ + INNER_SAVE_m4n8\ + INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m4n24 \ + INNER_SAVE_m4n16\ + INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ + INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) + +#define INNER_SAVE_m8n8 \ + "movq %3,%10;"\ + INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m8n16 \ + INNER_SAVE_m8n8\ + INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ + INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) + +#define INNER_SAVE_m8n24 \ + INNER_SAVE_m8n16\ + INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ + INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) + +#define COMPUTE_n8 {\ + b_pref = packed_b_pointer + 8 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 42222f;"\ + "42221:\n\t"\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ + "42222:\n\t"\ + "cmpq $4,%8; jb 42223f;"\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "42223:\n\t"\ + "cmpq $2,%8; jb 42224f;"\ + 
INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "42224:\n\t"\ + "cmpq $1,%8; jb 42225f;"\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "42225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n16 {\ + b_pref = packed_b_pointer + 16 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 32222f;"\ + "32221:\n\t"\ + INNER_INIT_m8n16\ + INNER_KERNELm8(16)\ + INNER_SAVE_m8n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ + "32222:\n\t"\ + "cmpq $4,%8; jb 32223f;"\ + INNER_INIT_m4n16\ + INNER_KERNELm4(16)\ + INNER_SAVE_m4n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "32223:\n\t"\ + "cmpq $2,%8; jb 32224f;"\ + INNER_INIT_m2n16\ + INNER_KERNELm2(16)\ + INNER_SAVE_m2n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "32224:\n\t"\ + "cmpq $1,%8; jb 32225f;"\ + INNER_INIT_m1n16\ + INNER_KERNELm1(16)\ + INNER_SAVE_m1n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "32225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n24 {\ + b_pref = packed_b_pointer + 24 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 22222f;"\ + "22221:\n\t"\ + INNER_INIT_m8n24\ + INNER_KERNELm8(24)\ + INNER_SAVE_m8n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ + "22222:\n\t"\ + "cmpq $4,%8; jb 22223f;"\ + INNER_INIT_m4n24\ + INNER_KERNELm4(24)\ + INNER_SAVE_m4n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "22223:\n\t"\ + "cmpq $2,%8; jb 22224f;"\ + INNER_INIT_m2n24\ + INNER_KERNELm2(24)\ + INNER_SAVE_m2n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "22224:\n\t"\ + "cmpq $1,%8; jb 22225f;"\ + INNER_INIT_m1n24\ + INNER_KERNELm1(24)\ + INNER_SAVE_m1n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "22225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ + 
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ + "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 +//perform C += A B + if(k==0 || m==0 || ndiv8==0) return; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); + int64_t K = (int64_t)k; int64_t M = (int64_t)m; + double *a_block_pointer,*b_pref; + double *c_pointer = c,*c_store = c; + __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; + BLASLONG ndiv8_count; + double *packed_b_pointer = packed_b; + a_block_pointer = packed_a; + for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ + COMPUTE_n24 + } + for(;ndiv8_count>1;ndiv8_count-=2){ + COMPUTE_n16 + } + if(ndiv8_count>0){ + COMPUTE_n8 + } +} + +/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ +/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */ +/* double accumulator: sc1; temporary variables: sa1,sb1 */ +/* column-major c_block */ +#define KERNEL_m4n4k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ + b_block_pointer+=4;\ +} +#define KERNEL_m4n2k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + b_block_pointer+=2;\ +} +#define KERNEL_m4n1k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + b_block_pointer++;\ +} +#define INIT_m4n1 yc1=_mm256_setzero_pd(); +#define INIT_m4n2 yc2=INIT_m4n1 +#define INIT_m4n4 yc4=yc3=INIT_m4n2 +#define SAVE_m4n1 {\ + yb1 = _mm256_broadcast_sd(alpha);\ + ya1 = _mm256_loadu_pd(c_pointer);\ + yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ + _mm256_storeu_pd(c_pointer,yc1);\ + c_pointer += 4;\ +} +#define SAVE_m4n2 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += 4;\ +} +#define SAVE_m4n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += LDC*2;\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ + c_pointer += 4-LDC*2;\ +} +#define 
KERNEL_m2n2k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ + b_block_pointer += 2;\ +} +#define KERNEL_m2n1k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + b_block_pointer ++;\ +} +#define INIT_m2n1 xc1=_mm_setzero_pd(); +#define INIT_m2n2 xc2=INIT_m2n1 +#define SAVE_m2n1 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xa1 = _mm_loadu_pd(c_pointer);\ + xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ + _mm_storeu_pd(c_pointer,xc1);\ + c_pointer += 2;\ +} +#define SAVE_m2n2 {\ + xa1 = _mm_loaddup_pd(alpha);\ + xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ + xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ + _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ + c_pointer += 2;\ +} +#define KERNEL_m1n1k1 {\ + sa1 = *a_block_pointer; a_block_pointer++;\ + sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ + b_block_pointer ++;\ +} +#define INIT_m1n1 sc1=0.0; +#define SAVE_m1n1 {\ + *c_pointer += sc1 * (*alpha);\ + c_pointer++;\ +} +/* row-major c_block */ +#define KERNEL_m2n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ + a_block_pointer += 2;\ +} +#define KERNEL_m1n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + a_block_pointer ++;\ +} +#define KERNEL_m1n2k1 {\ + xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ + xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + a_block_pointer ++;\ +} +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m4n1 +#define INIT_m2n4 INIT_m4n2 +#define SAVE_m2n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + yc2 = _mm256_mul_pd(yc2,ya1);\ + yb1 = _mm256_unpacklo_pd(yc1,yc2);\ + yb2 = _mm256_unpackhi_pd(yc1,yc2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ + _mm_storeu_pd(c_pointer,xb1);\ + _mm_storeu_pd(c_pointer+LDC,xb2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ + _mm_storeu_pd(c_pointer+2*LDC,xb1);\ + _mm_storeu_pd(c_pointer+3*LDC,xb2);\ + c_pointer += 2;\ +} +#define SAVE_m1n2 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xc1 = _mm_mul_pd(xc1,xb1);\ + *c_pointer += _mm_cvtsd_f64(xc1);\ + xa1 = _mm_unpackhi_pd(xc1,xc1);\ + c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ + c_pointer ++;\ +} +#define SAVE_m1n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + xb1 = _mm256_extractf128_pd(yc1,0);\ + *c_pointer += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ + xb1 = _mm256_extractf128_pd(yc1,1);\ + c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ + c_pointer ++;\ +} +static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 +//perform C += A B , edge_n<8 must be satisfied. 
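Further down, the driver function at the end of this file (its signature and body are also partly truncated in this rendering) splits the N dimension: full groups of 8 columns go to KERNEL_MAIN and any remaining columns (fewer than 8) go to KERNEL_EDGE. A hedged sketch of that dispatch, reconstructed from the calls that survive in the text, with ndiv8 presumably n/8 and the packing of A elided:

/* Fragment of the driver; packed_a is prepared before this point. */
if (ndiv8 > 0)
    KERNEL_MAIN(packed_a, B, m, ndiv8, k, ldc, C, &ALPHA);
if (n > ndiv8 * 8)   /* leftover columns: edge_n = n - ndiv8*8 < 8 */
    KERNEL_EDGE(packed_a, B + (int64_t)k * (int64_t)ndiv8 * 8,
                m, n - ndiv8 * 8, k, ldc,
                C + (int64_t)ldc * (int64_t)ndiv8 * 8, &ALPHA);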
+ if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return; + double *a_block_pointer,*b_block_pointer,*b_base_pointer; + double *c_pointer = c; + __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; + __m128d xc1,xc2,xa1,xb1,xb2; + double sc1,sa1,sb1; + BLASLONG m_count,n_count,k_count; + b_base_pointer = packed_b; +//now start calculation of the edge part + for(n_count=edge_n;n_count>3;n_count-=4){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n4 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n4 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n4 + for(k_count=0;k_count1;n_count-=2){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n2 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n2 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n2 + for(k_count=0;k_count0){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n1 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n1 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n1 + for(k_count=0;k_count0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); + if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); + return 0; +} diff --git a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S index 40c5892c6..c353a5913 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S +++ b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S @@ -1,4413 +1,4413 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. 
*/ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -/********************************************************************* -* 2013/06/02 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 360 -* DGEMM_Q 160 -* -* Performance at m x n without prefetch of BO: -* -* 5760x5760 93.4 GFLOPS with 8 threads on 4 modules (ACML: 90.8 GFLOPS) -* 5760x5760 84.2 GFLOPS with 4 threads on 4 modules (ACML: 82.4 GFLOPS) -* 3840x3840 50.3 GFLOPS with 2 threads on 2 modules (ACML: 49.5 GFLOPS) -* -* 5760x5760 56.4 GFLOPS with 4 threads on 2 modules (ACML: 58.5 GFLOPS) -* 3840x3840 29.0 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) -* 3840x3840 26.1 GFLOPS with 1 threads on 1 modules (ACML: 25.9 GFLOPS) -* -*********************************************************************/ - -/********************************************************************* -* 2013/06/03 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 336 -* DGEMM_Q 168 -* NO_WARMUP 1 -* NO_AFFINITY 1 -* GEMM_MULTITHREAD_THRESHOLD 4 -* -* Performance at m x n with prefetch of BO: -* -* 8064x3840 93.7 GFLOPS with 8 threads on 4 modules (ACML: 93.6 GFLOPS) -* 6048x2880 85.1 GFLOPS with 4 threads on 4 modules (ACML: 84.2 GFLOPS) -* 6048x2880 52.0 GFLOPS with 2 threads on 2 modules (ACML: 50.0 GFLOPS) -* -* 6048x2880 56.3 GFLOPS with 4 threads on 2 modules (ACML: 57.6 GFLOPS) -* 4032x1920 29.5 GFLOPS with 2 threads on 1 modules (ACML: 30.5 GFLOPS) -* 4032x1920 26.9 GFLOPS with 1 threads on 1 modules (ACML: 26.1 GFLOPS) -* -*********************************************************************/ - -/********************************************************************* -* 2013/06/04 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 384 -* DGEMM_Q 168 -* NO_WARMUP 1 -* NO_AFFINITY 1 -* GEMM_MULTITHREAD_THRESHOLD 4 -* -* Performance at m x n with prefetch of BO: -* -* 6144x5376 94.6 GFLOPS with 8 threads on 4 modules (ACML: 90.5 GFLOPS) -* 6144x5376 86.0 GFLOPS with 4 threads on 4 modules (ACML: 81.5 GFLOPS) -* 4608x4032 52.0 GFLOPS with 2 threads on 2 modules (ACML: 47.5 GFLOPS) -* -* 6144x5376 57.3 GFLOPS with 4 threads on 2 modules (ACML: 56.5 GFLOPS) -* 4608x4032 29.6 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) -* 4608x4032 26.9 GFLOPS with 1 threads on 1 modules (ACML: 25.6 GFLOPS) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 
128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - 
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 
8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - - - - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ - - - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - vmovddup -4 * 
SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -1 * SIZE(BO, BI, 8), 
%xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - - - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 
;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq 
%rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 - movq B, BO1 - leaq (B,%rax,8), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L6_02a - ALIGN_4 - -.L6_02: - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm2 - vmovups 4*SIZE(BO1), %xmm4 - vmovups 6*SIZE(BO1), %xmm6 - vmovsd (BO2), %xmm1 - vmovsd 2*SIZE(BO2), %xmm3 - vmovsd 4*SIZE(BO2), %xmm5 - vmovsd 6*SIZE(BO2), %xmm7 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - decq %rax - jnz .L6_02 - -.L6_02a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_02c - ALIGN_4 - -.L6_02b: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax,8), BO1 // next offset to BO1 - leaq (BO1,%rax,8), BO2 // next offset to BO1 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $2, %rax // k / 4 - jz .L6_03a - ALIGN_4 - - -.L6_03: - - prefetcht0 512(BO2) - prefetchw 512(BO) - vmovups (BO2), %xmm0 - vmovups 2*SIZE(BO2), %xmm2 - vmovups 4*SIZE(BO2), %xmm4 - vmovups 6*SIZE(BO2), %xmm6 - vmovsd 1*SIZE(BO1), %xmm1 - vmovsd 3*SIZE(BO1), %xmm3 - vmovsd 5*SIZE(BO1), %xmm5 - vmovsd 7*SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - decq %rax - jnz .L6_03 - -.L6_03a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_03c - ALIGN_4 - - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - 
KERNEL8x3_4(xxx) - - je .L6_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $7, M - jz .L7_10 // to next 3 lines of N - - testq $4, M - jz .L6_30 - - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - 
vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - 
addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L7_20 - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_11 - ALIGN_4 - -.L7_20: - // Test rest of M - - testq $7, M - jz .L7_60 // to next 6 lines of N - - testq $4, M - jz .L7_30 - - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - prefetcht0 
B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - -.L7_40: - testq $1, M - jz .L7_60 // to next 6 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - prefetcht0 B_PR1(BO,BI,8) - 
KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * 
SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - 
ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - 
vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - 
KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - - vmovsd %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_0: - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO 
-#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - vmulpd %xmm0, %xmm11,%xmm11 - vmulpd %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - 
ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), 
AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - 
KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - vmulsd %xmm0, %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - 
- je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - 
KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - 
movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/********************************************************************* +* 2013/06/02 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 360 +* DGEMM_Q 160 +* +* Performance at m x n without prefetch of BO: +* +* 5760x5760 93.4 GFLOPS with 8 threads on 4 modules (ACML: 90.8 GFLOPS) +* 5760x5760 84.2 GFLOPS with 4 threads on 4 modules (ACML: 82.4 GFLOPS) +* 3840x3840 50.3 GFLOPS with 2 threads on 2 modules (ACML: 49.5 GFLOPS) +* +* 5760x5760 56.4 GFLOPS with 4 threads on 2 modules (ACML: 58.5 GFLOPS) +* 3840x3840 29.0 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) +* 3840x3840 26.1 GFLOPS with 1 threads on 1 modules (ACML: 25.9 GFLOPS) +* +*********************************************************************/ + +/********************************************************************* +* 2013/06/03 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 336 +* DGEMM_Q 168 +* NO_WARMUP 1 +* NO_AFFINITY 1 +* GEMM_MULTITHREAD_THRESHOLD 4 +* +* Performance at m x n with prefetch of BO: +* +* 8064x3840 93.7 GFLOPS with 8 threads on 4 modules (ACML: 93.6 GFLOPS) +* 6048x2880 85.1 GFLOPS with 4 threads on 4 modules (ACML: 84.2 GFLOPS) +* 6048x2880 52.0 GFLOPS with 2 threads on 2 modules (ACML: 50.0 GFLOPS) +* +* 6048x2880 56.3 GFLOPS with 4 threads on 2 modules (ACML: 57.6 GFLOPS) +* 4032x1920 29.5 GFLOPS with 2 threads on 1 modules (ACML: 30.5 GFLOPS) +* 4032x1920 26.9 GFLOPS with 1 threads on 1 modules (ACML: 26.1 GFLOPS) +* +*********************************************************************/ + +/********************************************************************* +* 2013/06/04 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 384 +* DGEMM_Q 168 +* NO_WARMUP 1 +* NO_AFFINITY 1 +* GEMM_MULTITHREAD_THRESHOLD 4 +* +* Performance at m x n with prefetch of BO: +* +* 6144x5376 94.6 GFLOPS with 8 threads on 4 modules (ACML: 90.5 GFLOPS) +* 6144x5376 86.0 GFLOPS with 4 threads on 4 modules (ACML: 81.5 GFLOPS) +* 4608x4032 52.0 GFLOPS with 2 threads on 2 modules (ACML: 47.5 GFLOPS) +* +* 6144x5376 57.3 GFLOPS with 4 threads on 2 modules (ACML: 56.5 GFLOPS) +* 4608x4032 29.6 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) +* 4608x4032 26.9 GFLOPS with 1 threads on 1 modules (ACML: 25.6 GFLOPS) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K 
%r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + 
vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 
8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * 
SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * 
SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + 
movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 512(BO2) + prefetchw 512(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq 
BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + 
KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + 
jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer 
to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq 
BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + 
ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + 
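/**************************************************************************
* Note (illustrative sketch only; the loop variable "k" below is not a
* symbol defined in this file): throughout this kernel the K dimension is
* split the same way as in the unrolled loop above. The main loop consumes
* K & ~7 iterations eight at a time, and the .L2_36/.L2_37 tail that
* follows consumes the remaining K & 7 iterations one at a time, roughly
*
*     for (k = 0; k < (K & ~7); k += 8)  { ... eight KERNEL2x2_x steps ... }
*     for (     ; k <  K      ; k += 1)  { ... one KERNEL2x2_SUB step ...  }
*
* after which the accumulators are merged into C as C = alpha*acc + C
* via the vfmaddpd/vfmaddsd stores in the .L2_39-style epilogues.
***************************************************************************/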
+.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + 
KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # 
coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq 
$2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK 
+#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, 
%rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + 
subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + 
KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq 
%rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index adc00cca3..48eb1bcbe 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -1,4523 +1,4523 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/********************************************************************* -* -* 2013/11/13 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/31 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 768 -* DGEMM_Q 168 -* DGEMM_R 12288 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) -* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) -* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) -* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior -* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior -* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) -* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) -* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) -* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 
32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 - -#else - -#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 - -#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 - -#endif - - - - -#define A_PR1 512 -#define B_PR1 256 -#define C_PR1 64 - -.macro INIT8x3 - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 -.endm - -.macro KERNEL8x3_INIT - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - vmulpd %xmm1,%xmm0,%xmm4 - vmovddup -11 * SIZE(BO), %xmm2 - vmulpd %xmm2,%xmm0,%xmm5 - vmovddup -10 * SIZE(BO), %xmm3 - vmulpd %xmm3,%xmm0,%xmm6 - vmovups -14 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm7 - vmulpd %xmm2,%xmm0,%xmm8 - vmulpd %xmm3,%xmm0,%xmm9 - vmovups -12 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm10 - vmulpd %xmm2,%xmm0,%xmm11 - addq $ 3 * SIZE, BO - vmulpd %xmm3,%xmm0,%xmm12 - vmovups -10 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm13 - vmovddup -12 * SIZE(BO), %xmm1 - vmulpd %xmm2,%xmm0,%xmm14 - vmovddup -11 * SIZE(BO), %xmm2 - vmulpd %xmm3,%xmm0,%xmm15 -.endm - - -.macro KERNEL8x3_M1 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -12 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M2 - vmovups -8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+64(AO) - vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -9 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -8 * 
SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - - -.macro KERNEL8x3_M3 - vmovups 0 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+128(AO) - vmovddup -7 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -6 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -5 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M4 - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup -4 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -3 * SIZE(BO), %xmm1 - addq $ 32 * SIZE, AO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -2 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M5 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - vmovddup -1 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 0 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 1 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M6 - vmovups -8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+64(AO) - vmovddup 2 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 3 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 4 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - - -.macro KERNEL8x3_M7 - vmovups 0 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+128(AO) - vmovddup 5 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 6 * SIZE(BO), %xmm1 - VFMADD231PD_( 
%xmm2,%xmm0,%xmm14 ) - vmovddup 7 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M8 - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 9 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 10 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) - vmovddup 11 * SIZE(BO), %xmm3 - addq $ 32 * SIZE, AO - addq $ 24 * SIZE, BO -.endm - - -.macro KERNEL8x3_E - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $ 32 * SIZE, AO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $ 21 * SIZE, BO - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_SUBN - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $ 3 * SIZE, BO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $ 8 * SIZE, AO - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro SAVE8x3 - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - prefetcht0 C_PR1(CO1) - prefetcht0 C_PR1(CO1,LDC) - prefetcht0 C_PR1(CO1,LDC,2) - - addq $ 8 * SIZE, CO1 # coffset += 8 -.endm - - 
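The KERNEL8x3_* and SAVE8x3 macros above keep an 8x3 tile of C in twelve SSE registers (xmm4 through xmm15, two doubles each), broadcast one B value at a time with vmovddup, and accumulate with FMA4 (vfmaddpd) on Bulldozer or FMA3 (vfmadd231pd) elsewhere through the VFMADD231PD_ wrapper; SAVE8x3 then applies alpha and adds the tile back into C. As a rough model of that arithmetic only, here is a minimal scalar C sketch; the function name and the packed-buffer layout are illustrative assumptions, not code from this file.

/* Illustrative model only: a scalar rendering of the 8x3 register-tile
 * update that the KERNEL8x3_* / SAVE8x3 macros implement with SSE + FMA.
 * Names and packing layout are hypothetical, not taken from the kernel. */
#include <stddef.h>

static void dgemm_tile_8x3(size_t k, double alpha,
                           const double *A,   /* packed: 8 doubles per k step */
                           const double *B,   /* packed: 3 doubles per k step */
                           double *C, size_t ldc)
{
    double acc[8][3] = {{0.0}};            /* plays the role of xmm4..xmm15   */

    for (size_t p = 0; p < k; p++)          /* the unrolled K loop             */
        for (int j = 0; j < 3; j++)         /* one broadcast B value (vmovddup)*/
            for (int i = 0; i < 8; i++)     /* 8 A values (4 x vmovups)        */
                acc[i][j] += A[8 * p + i] * B[3 * p + j];   /* the FMA         */

    for (int j = 0; j < 3; j++)             /* SAVE8x3: C = alpha*acc + C      */
        for (int i = 0; i < 8; i++)
            C[i + j * ldc] = alpha * acc[i][j] + C[i + j * ldc];
}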
-/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - - - - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_4(xx) \ - 
vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - - - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, 
8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - - - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - 
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 - movq B, BO1 - leaq (B,%rax,8), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L6_02a - ALIGN_4 - -.L6_02: - prefetcht0 B_PR1(BO1) - prefetcht0 B_PR1(BO2) - prefetchw B_PR1(BO) - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm2 - vmovups 4*SIZE(BO1), %xmm4 - vmovups 6*SIZE(BO1), %xmm6 - vmovsd (BO2), %xmm1 - vmovsd 2*SIZE(BO2), %xmm3 - vmovsd 4*SIZE(BO2), %xmm5 - vmovsd 6*SIZE(BO2), %xmm7 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 
12*SIZE,BO - decq %rax - jnz .L6_02 - -.L6_02a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_02c - ALIGN_4 - -.L6_02b: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax,8), BO1 // next offset to BO1 - leaq (BO1,%rax,8), BO2 // next offset to BO1 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $2, %rax // k / 4 - jz .L6_03a - ALIGN_4 - - -.L6_03: - - prefetcht0 B_PR1(BO2) - prefetchw B_PR1(BO) - vmovups (BO2), %xmm0 - vmovups 2*SIZE(BO2), %xmm2 - vmovups 4*SIZE(BO2), %xmm4 - vmovups 6*SIZE(BO2), %xmm6 - vmovsd 1*SIZE(BO1), %xmm1 - vmovsd 3*SIZE(BO1), %xmm3 - vmovsd 5*SIZE(BO1), %xmm5 - vmovsd 7*SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L6_03 - -.L6_03a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_03c - ALIGN_4 - - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 1*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L6_20 - - ALIGN_4 - -.L6_11: - - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - movq K, %rax - sarq $3, %rax // K / 8 - cmpq $3, %rax - jl .L6_13 - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - subq $2, %rax - - ALIGN_5 - -.L6_12: - - prefetcht0 B_PR1-24(BO) - prefetcht0 B_PR1+40(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - prefetcht0 B_PR1+104(BO) - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - dec %rax - jne .L6_12 - -.L6_12_E: - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L6_16 - -.L6_13: - - test $2, %rax - jz .L6_14 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L6_16 - - -.L6_14: - - test $1, %rax - jz .L6_15 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - - jmp .L6_16 - -.L6_15: - - INIT8x3 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL8x3_SUBN - dec %rax - jne .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE8x3 - - decq I # i -- - jg .L6_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $7, M - jz .L7_10 // to next 3 lines of N - - testq $4, M - jz .L6_30 - - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * 
SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - 
KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L7_20 - ALIGN_4 - -.L7_11: - - leaq BUFFER2, BO // first buffer to BO - addq $12 * SIZE, BO - movq K, %rax - sarq $3, %rax // K / 8 - cmpq $3, %rax - jl .L7_13 - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - subq $2, %rax - - ALIGN_5 - -.L7_12: - - prefetcht0 B_PR1-24(BO) - prefetcht0 B_PR1+40(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - prefetcht0 B_PR1+104(BO) - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - dec %rax - jne .L7_12 - -.L7_12_E: - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - - -.L7_13: - - test $2, %rax - jz .L7_14 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - -.L7_14: - - test $1, %rax - jz .L7_15 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - - -.L7_15: - - INIT8x3 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - - ALIGN_4 - -.L7_17: - - KERNEL8x3_SUBN - dec %rax - jne .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE8x3 - - decq I # i -- - jg .L7_11 - ALIGN_4 - -.L7_20: - // Test rest of M - - testq $7, M - jz .L7_60 // to next 6 lines of N - - testq $4, M - jz .L7_30 - - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - 
KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - -.L7_40: - testq $1, M - jz .L7_60 // to next 6 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl 
.L7_47 - ALIGN_4 - - -.L7_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of 
values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 
2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - 
KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - - vmovsd %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else 
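The code after the #else above is the TRMMKERNEL build of the same file: the loop structure is unchanged, but the result is stored as C = alpha*AB (vmulpd/vmulsd instead of vfmaddpd/vfmaddsd), and each micro-tile trims its K loop with the KK/KKK bookkeeping driven by OFFSET, LEFT and TRANSA. A minimal sketch of that trip count follows, using descriptive names that are not the kernel's own; it only mirrors the movq/subq/addq sequences visible in the source.

/* Illustrative model of the KKK computation in the TRMM variant.
 * 'left'/'transa' stand in for the LEFT/TRANSA build flags; unroll_m and
 * unroll_n are the current tile sizes (e.g. 8 and 2 for the 8x2 path). */
static long trmm_k_count(long K, long KK, int left, int transa,
                         long unroll_m, long unroll_n)
{
    if ((left && !transa) || (!left && transa))
        return K - KK;                              /* movq K; subq KK        */
    else
        return KK + (left ? unroll_m : unroll_n);   /* movq KK; addq $unroll  */
}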
-/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_0: - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - 
KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - vmulpd %xmm0, %xmm11,%xmm11 - vmulpd %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - 
KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - 
- KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - vmulsd %xmm0, %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - 
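In the TRMM branch just closed, the per-tile loop length is not K itself: KKK is derived from K and the running offset KK according to the LEFT/TRANSA switches, and KK is advanced as tiles are completed. A hedged C sketch of that bookkeeping follows; left, transa, mr and nr stand in for the preprocessor switches and tile sizes and are not names from the source.

    /* Stand-in for the KK/KKK bookkeeping of the TRMM kernel above. */
    static long trmm_effective_k(long K, long kk, int left, int transa,
                                 long mr, long nr)
    {
        if ((left && !transa) || (!left && transa))
            return K - kk;                /* "movq K,%rax; subq KK,%rax" branch   */
        else
            return kk + (left ? mr : nr); /* "movq KK,%rax; addq $mr/$nr" branch  */
    }

After each row block KK is advanced by the tile height (the addq $8/$4/$2/$1, KK under LEFT), and at the end of a column panel by the panel width when LEFT is not defined (the addq $2, KK at .L2_60).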
-/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - 
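The 8x1 loop above handles full eight-row tiles of M; the "Rest of M" blocks that follow peel the remaining rows with single 4-, 2- and 1-row tiles via the testq $4/$2/$1 sequence. The decomposition, in schematic C form (run_tile is a placeholder for the matching KERNELmx* paths, not a symbol from the source):

    /* Sketch of how the kernel walks M: full 8-row tiles first, then the
       remainder is peeled one tile each of width 4, 2 and 1. */
    static void walk_m(long M, void (*run_tile)(long mr))
    {
        for (long i = 0; i < (M >> 3); i++)
            run_tile(8);              /* .L1_11 main loop, sarq $3, I */
        if (M & 4) run_tile(4);       /* .L1_21 */
        if (M & 2) run_tile(2);       /* .L1_31 */
        if (M & 1) run_tile(1);       /* .L1_41 */
    }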
-/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK 
-#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO 
- leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#endif +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/********************************************************************* +* +* 2013/11/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 768 +* DGEMM_Q 168 +* DGEMM_R 12288 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) +* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior +* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior +* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) +* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) +* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) +* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 + +#else + +#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 + +#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 + +#endif + + + + +#define A_PR1 512 +#define B_PR1 256 +#define C_PR1 64 + +.macro INIT8x3 + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, 
%xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 +.endm + +.macro KERNEL8x3_INIT + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmulpd %xmm1,%xmm0,%xmm4 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm2,%xmm0,%xmm5 + vmovddup -10 * SIZE(BO), %xmm3 + vmulpd %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm7 + vmulpd %xmm2,%xmm0,%xmm8 + vmulpd %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm10 + vmulpd %xmm2,%xmm0,%xmm11 + addq $ 3 * SIZE, BO + vmulpd %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm13 + vmovddup -12 * SIZE(BO), %xmm1 + vmulpd %xmm2,%xmm0,%xmm14 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -12 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M2 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -9 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -8 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + + +.macro KERNEL8x3_M3 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup -7 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -6 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -5 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M4 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup -4 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -3 * SIZE(BO), %xmm1 + addq $ 32 * SIZE, AO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -2 * 
SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M5 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmovddup -1 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 0 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 1 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M6 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup 2 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 3 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 4 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + + +.macro KERNEL8x3_M7 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup 5 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 6 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 7 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M8 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 9 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 10 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) + vmovddup 11 * SIZE(BO), %xmm3 + addq $ 32 * SIZE, AO + addq $ 24 * SIZE, BO +.endm + + +.macro KERNEL8x3_E + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + 
addq $ 32 * SIZE, AO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + addq $ 21 * SIZE, BO + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_SUBN + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + addq $ 3 * SIZE, BO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + addq $ 8 * SIZE, AO + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro SAVE8x3 + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + prefetcht0 C_PR1(CO1) + prefetcht0 C_PR1(CO1,LDC) + prefetcht0 C_PR1(CO1,LDC,2) + + addq $ 8 * SIZE, CO1 # coffset += 8 +.endm + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define 
KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 
8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + 
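Each KERNEL8x2 step in this group broadcasts two B values with vmovddup and streams eight A values as four two-wide loads, accumulating with fused multiply-adds into eight separate accumulators. In scalar C terms, one k-step amounts to the following; acc and the function name are illustrative and do not appear in the source:

    /* One k-step of the 8x2 micro-kernel in scalar form.  acc stands in for
       the xmm4..xmm14 accumulator registers. */
    static inline void kernel8x2_step(const double *a, const double *b,
                                      double acc[8][2])
    {
        double b0 = b[0], b1 = b[1];      /* the two vmovddup broadcasts of B      */
        for (int i = 0; i < 8; i++) {     /* four vmovups loads of 2 doubles of A  */
            acc[i][0] += a[i] * b0;       /* vfmaddpd into xmm4/xmm7/xmm10/xmm13   */
            acc[i][1] += a[i] * b1;       /* vfmaddpd into xmm5/xmm8/xmm11/xmm14   */
        }
    }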
+#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, 
%rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, 
BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + +/*******************************************************************************************/ + +#if 
!defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 B_PR1(BO1) + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C 
+ leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L6_13 + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L6_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L6_12 + +.L6_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + +.L6_13: + + test $2, %rax + jz .L6_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + + +.L6_14: + + test $1, %rax + jz .L6_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + + jmp .L6_16 + +.L6_15: + + INIT8x3 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUBN + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE8x3 + + decq I # i -- + jg .L6_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + 
vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L7_13 + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + 
KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L7_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L7_12 + +.L7_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_13: + + test $2, %rax + jz .L7_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + +.L7_14: + + test $1, %rax + jz .L7_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_15: + + INIT8x3 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUBN + dec %rax + jne .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE8x3 + + decq I # i -- + jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq 
BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI 
// BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq 
(BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, 
%rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq 
$0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK 
+#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in 
AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 
8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, 
%rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/kernel/x86_64/dgemm_ncopy_8_skylakex.c b/kernel/x86_64/dgemm_ncopy_8_skylakex.c index 74b336f3d..874ef68d6 100644 --- a/kernel/x86_64/dgemm_ncopy_8_skylakex.c +++ b/kernel/x86_64/dgemm_ncopy_8_skylakex.c @@ -52,18 +52,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __ FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, 
ctemp63, ctemp64;
+  FLOAT ctemp17 /*, ctemp18, ctemp19, ctemp20*/ ;
+  FLOAT /*ctemp21, ctemp22,*/ ctemp23, ctemp24;
+  FLOAT ctemp25 /*, ctemp26, ctemp27, ctemp28*/ ;
+  FLOAT /*ctemp29, ctemp30,*/ ctemp31, ctemp32;
+  FLOAT ctemp33 /*, ctemp34, ctemp35, ctemp36*/ ;
+  FLOAT /*ctemp37, ctemp38,*/ ctemp39, ctemp40;
+  FLOAT ctemp41 /*, ctemp42, ctemp43, ctemp44*/ ;
+  FLOAT /*ctemp45, ctemp46,*/ ctemp47, ctemp48;
+  FLOAT ctemp49 /*, ctemp50, ctemp51, ctemp52*/ ;
+  FLOAT /*ctemp53, ctemp54,*/ ctemp55, ctemp56;
+  FLOAT ctemp57 /*, ctemp58, ctemp59, ctemp60*/ ;
+  FLOAT /*ctemp61, ctemp62,*/ ctemp63, ctemp64;

   aoffset = a;
diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c
index e695f00c5..b11893f5d 100644
--- a/kernel/x86_64/omatcopy_rt.c
+++ b/kernel/x86_64/omatcopy_rt.c
@@ -142,7 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
 }
 int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){
-    float *src, *dst, *dst_tmp, *src_base, *dst_base;
+    float *src, *dst, *dst_tmp=0, *src_base, *dst_base;
     uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0;
     BLASLONG cols_left, rows_done; float ALPHA = alpha;
     if(ALPHA==0.0){
diff --git a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
index 9cc27184d..b31a934f2 100644
--- a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
+++ b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
@@ -1,5231 +1,5231 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin. */
-/* All rights reserved. */
-/* */
-/* Redistribution and use in source and binary forms, with or */
-/* without modification, are permitted provided that the following */
-/* conditions are met: */
-/* */
-/* 1. Redistributions of source code must retain the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer. */
-/* */
-/* 2. Redistributions in binary form must reproduce the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer in the documentation and/or other materials */
-/* provided with the distribution. */
-/* */
-/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
-/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
-/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
-/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
-/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
-/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
-/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
-/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
-/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
-/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
-/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
-/* POSSIBILITY OF SUCH DAMAGE. */
-/* */
-/* The views and conclusions contained in the software and */
-/* documentation are those of the authors and should not be */
-/* interpreted as representing official policies, either expressed */
-/* or implied, of The University of Texas at Austin.
*/ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -#define KERNEL16x3_1(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_2(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - 
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_3(xx) \ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_4(xx) \ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - addq $12, BI ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $64, %rax ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 
;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL16x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - 
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL16x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 
;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - 
vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - 
movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovss 0 * SIZE(BO2), %xmm2 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovss 1*SIZE(BO1), %xmm0 - vmovsd 0*SIZE(BO2), %xmm1 - vmovss %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 
* SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; 
number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - 
KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups 
%xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_20_7 - ALIGN_4 - -.L7_20_9: - - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - 
leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 
, (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // 
rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 
B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - 
KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - 
ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - - vmovss %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* 
TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - 
KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - vmulps %xmm0, %xmm11,%xmm11 - vmulps %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: 
- - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - 
ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - vmulss 
%xmm0, %xmm5,%xmm5 - vmulss %xmm0, %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 
-*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - 
-/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulps %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 
SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulss %xmm0, %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - 
-        addq    $STACKSIZE, %rsp
-        ret
-
-        EPILOGUE
-
-
-
-
-
-#endif
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+
+#define ASSEMBLER
+#include "common.h"
+
+#define OLD_M %rdi
+#define OLD_N %rsi
+#define M %r13
+#define J %r14
+#define OLD_K %rdx
+
+#define A %rcx
+#define B %r8
+#define C %r9
+#define LDC %r10
+
+#define I %r11
+#define AO %rdi
+#define BO %rsi
+#define CO1 %r15
+#define K %r12
+#define BI %rbp
+#define SP %rbx
+
+#define BO1 %rdi
+#define BO2 %r15
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+
+#else
+
+#define STACKSIZE 256
+
+#define OLD_A 40 + STACKSIZE(%rsp)
+#define OLD_B 48 + STACKSIZE(%rsp)
+#define OLD_C 56 + STACKSIZE(%rsp)
+#define OLD_LDC 64 + STACKSIZE(%rsp)
+#define OLD_OFFSET 72 + STACKSIZE(%rsp)
+
+#endif
+
+#define L_BUFFER_SIZE 8192
+#define LB2_OFFSET 4096
+
+#define Ndiv6 24(%rsp)
+#define Nmod6 32(%rsp)
+#define N 40(%rsp)
+#define ALPHA 48(%rsp)
+#define OFFSET 56(%rsp)
+#define KK 64(%rsp)
+#define KKK 72(%rsp)
+#define BUFFER1 128(%rsp)
+#define BUFFER2 LB2_OFFSET+128(%rsp)
+
+#if defined(OS_WINDOWS)
+#if L_BUFFER_SIZE > 16384
+#define STACK_TOUCH \
+        movl $0, 4096 * 4(%rsp);\
+        movl $0, 4096 * 3(%rsp);\
+        movl $0, 4096 * 2(%rsp);\
+        movl $0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+        movl $0, 4096 * 3(%rsp);\
+        movl $0, 4096 * 2(%rsp);\
+        movl $0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+        movl $0, 4096 * 2(%rsp);\
+        movl $0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+        movl $0, 4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+
+
+
+#define A_PR1 384
+#define B_PR1 192
+
+/*******************************************************************************************
+* 3 lines of N
+*******************************************************************************************/
+
+#define KERNEL16x3_1(xx) \
+        vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
+        vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
+        vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
+        vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
+        vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
+        vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
+        nop ;\
+        vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
+        vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
+        vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
+        vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
+        prefetcht0 A_PR1(AO,%rax,SIZE) ;\
+        vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
+        vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
+        vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
+        vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
+        vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\
+        vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
+        vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
+        vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
+        vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
+        vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
+        vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
+
+#define KERNEL16x3_2(xx) \
+        vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
+        vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
+        vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
+        vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
+        nop ;\
+        vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
+        vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
+        vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
+        vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
+        prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
+        vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
+        vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
+        vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
+        vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
+        vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\
+        vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
+        vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
+        vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 
;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + 
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 
;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + 
vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + 
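Reviewer note (not part of the patch): the copy loop above re-packs B for the three-column micro-kernels. Reading the addressing, the incoming 6-column block of B is laid out as three 2-wide, k-interleaved panels; BUFFER1 receives columns 0..2 as three consecutive floats per k so the Mx3 kernels can read B[k] in one short stream, and the analogous loop below fills BUFFER2 with columns 3..5. A minimal C sketch of that re-packing, under the panel-layout assumption just stated; the function and pointer names are illustrative only:

    /* Illustrative re-packing of one K x 6 block of B (names hypothetical).
       The block is assumed to arrive as three 2-wide, k-interleaved panels:
       panel0 = columns 0..1, panel1 = columns 2..3, panel2 = columns 4..5. */
    static void repack_b6(const float *b, float *buffer1, float *buffer2,
                          long k)
    {
        const float *p0 = b;            /* columns 0,1 : two floats per k */
        const float *p1 = b + 2 * k;    /* columns 2,3                    */
        const float *p2 = b + 4 * k;    /* columns 4,5                    */

        for (long i = 0; i < k; i++) {
            /* BUFFER1: columns 0,1,2 as three consecutive floats per k */
            buffer1[3 * i + 0] = p0[2 * i + 0];
            buffer1[3 * i + 1] = p0[2 * i + 1];
            buffer1[3 * i + 2] = p1[2 * i + 0];
            /* BUFFER2: columns 3,4,5 */
            buffer2[3 * i + 0] = p1[2 * i + 1];
            buffer2[3 * i + 1] = p2[2 * i + 0];
            buffer2[3 * i + 2] = p2[2 * i + 1];
        }
    }

The assembly does the same work unrolled by four k-steps (12 floats per buffer write group) and twice per trip for the K/8 main loop, with the K%8 tail handled by the short loop that follows this point.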
movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 
* SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; 
number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + 
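Reviewer note (not part of the patch): each KERNELMx3_* macro used in the loops around this point performs one rank-1 update of an M x 3 accumulator tile. With FMA4 in AT&T operand order, vfmaddps %acc,%b,%a,%acc reads as acc = a*b + acc, where %a holds four consecutive A elements (vmovups) and %b a broadcast B element (vbroadcastss); the vfmaddss forms in the 2x3 and 1x3 macros do the same one element at a time. A short C sketch of a single k-step, assuming a generic tile height MR in {16, 8, 4, 2, 1}:

    /* One k-step of the M x 3 micro-kernel: acc += a * b^T.
       mr is the register tile height (16, 8, 4, 2 or 1 in this file). */
    static void kernel_mx3_step(int mr, const float *a, const float *b,
                                float acc[][3])
    {
        for (int i = 0; i < mr; i++) {      /* vmovups: 4 A elements at a time */
            for (int j = 0; j < 3; j++) {   /* vbroadcastss of b[0], b[1], b[2] */
                acc[i][j] += a[i] * b[j];   /* vfmaddps / vfmaddss             */
            }
        }
    }

After the k loop, the .L*_19/_29/_39 blocks fuse the alpha scaling with the load of C in the same way: vfmaddps mem(C),%alpha,%acc,%acc leaves alpha*acc + C in the accumulator before it is stored back.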
KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups 
%xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + 
leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 
, (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // 
rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + 
KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + 
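Reviewer note (not part of the patch): every tile size in this file drives its k loop the same way. The multiple-of-8 part (andq $-8) is processed in groups of KERNEL*_1..4 (four k-steps each) with a je exit test after every second group, the remainder (andq $7) is processed one k-step at a time by KERNEL*_SUB, and AO/BO are advanced past the whole block up front so the negated counters %rax/BI run up to zero, which lets the addq that bumps the index also provide the flags for the exit test. A minimal C sketch of that control shape, assuming a generic MR x NR tile; kernel_step is a hypothetical stand-in for one *_SUB macro:

    /* Hypothetical stand-in for one KERNEL*_SUB invocation: a single
       rank-1 update of an mr x nr accumulator tile.                  */
    static void kernel_step(long mr, long nr,
                            const float *a, const float *b, float *acc)
    {
        for (long i = 0; i < mr; i++)
            for (long j = 0; j < nr; j++)
                acc[i * nr + j] += a[i] * b[j];
    }

    /* Control shape of the k loop: unroll-by-8 main part plus scalar tail.
       In the assembly the index runs from -main up to 0 (negq %rax / negq BI),
       so the final addq doubles as the loop-exit condition.                  */
    static void k_loop(long k, long mr, long nr,
                       const float *ao, const float *bo, float *acc)
    {
        long main = k & ~7L;          /* andq $-8, %rax */
        long tail = k & 7L;           /* andq $7,  %rax */
        long i;

        for (i = 0; i < main; i++)    /* KERNEL*_1 .. KERNEL*_4 groups */
            kernel_step(mr, nr, ao + i * mr, bo + i * nr, acc);

        for (; i < main + tail; i++)  /* KERNEL*_SUB */
            kernel_step(mr, nr, ao + i * mr, bo + i * nr, acc);
    }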
ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* 
TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + 
KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: 
+ + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + 
ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss 
%xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + 
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + 
addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S index 7c42f1e12..35b01de07 100644 --- a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S +++ b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S @@ -1,5258 +1,5258 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/********************************************************************* -* -* 2013/10/18 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/29 Saar -* -* Parameter: -* UNROLL_M 16 -* UNROLL_N 2 -* SGEMM_P 768 -* SGEMM_Q 192 -* SGEMM_R 12288 -* A_PR1 384 -* B_PR1 192 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) -* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) -* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) -* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) -* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) -* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) -* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) -* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) -* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -#define KERNEL16x3_1(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_2(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_3(xx) \ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_4(xx) \ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - addq $12, BI ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $64, %rax ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss 
%xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL16x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_4(xx) \ - vmovss 2 * 
SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL16x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - 
-/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J 
- cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovss 0 * SIZE(BO2), %xmm2 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovss 1*SIZE(BO1), %xmm0 - vmovsd 0*SIZE(BO2), %xmm1 - vmovss %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // 
aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - 
KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - 
KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - 
KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_20_7 - ALIGN_4 - -.L7_20_9: - - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - vmovups 
%xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , 
(CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - 
-.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, 
SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - - 
vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je 
.L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - 
ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - - vmovss %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - 
-.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - vmulps %xmm0, %xmm11,%xmm11 - vmulps %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - 
-/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, 
%rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - vmulss %xmm0, %xmm5,%xmm5 - vmulss %xmm0, %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - 
KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - 
KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 
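Editor's note: every size variant in this file splits its K loop the same way, visible again in .L1_20_2/.L1_20_7 above: andq $-8 selects the multiple-of-eight part, which runs as two groups of four fused KERNEL*_1..4 macro calls per pass, and andq $7 leaves the remainder for the one-step KERNEL*_SUB tail. A minimal C sketch of that split; k_loop and step are illustrative names only:

    #include <stddef.h>

    /* Sketch of the K-loop structure: bulk of K unrolled by 8,
     * leftover (K & 7) iterations handled one at a time. */
    static void k_loop(size_t k, void (*step)(size_t))
    {
        size_t bulk = k & ~(size_t)7;   /* andq $-8, %rax */
        size_t l = 0;
        for (; l < bulk; l += 8)        /* two KERNEL*_1..4 groups */
            for (int u = 0; u < 8; u++)
                step(l + u);
        for (; l < k; l++)              /* andq $7, %rax tail */
            step(l);
    }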
- ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulps %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq 
KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - 
jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulss %xmm0, %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/29 Saar +* +* Parameter: +* UNROLL_M 16 +* UNROLL_N 2 +* SGEMM_P 768 +* SGEMM_Q 192 +* SGEMM_R 12288 +* A_PR1 384 +* B_PR1 192 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) +* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) +* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) +* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) +* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) +* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) +* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) +* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) +* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss 
%xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * 
SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
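Editor's note: the KERNEL16x1/8x1/4x1 macros above all follow the same single-column pattern: one vbroadcastss of the current B value, then vfmaddps into 16, 8 or 4 packed A values (four floats per xmm register). A minimal C sketch of one such step; kernel_wx1_sub and width are illustrative names, not part of the kernel:

    /* Sketch of one KERNEL*x1_SUB step: broadcast a single B value
     * and accumulate it against `width` A values. */
    static void kernel_wx1_sub(const float *a, const float *b,
                               float *acc, int width)
    {
        float bk = b[0];                 /* vbroadcastss from BO */
        for (int i = 0; i < width; i++)  /* vmovups + vfmaddps */
            acc[i] += a[i] * bk;
    }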
+/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J 
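+
+/*
+ * Column loop (non-TRMM path): N is processed in Ndiv6 groups of 6 columns,
+ * each handled as two 3-column sweeps. The packing loop at .L6_01 below
+ * re-packs B into BUFFER1 (first 3 columns of the group) and BUFFER2
+ * (remaining 3 columns), 3 values per k step; the .L6_xx blocks consume
+ * BUFFER1 and the .L7_xx blocks consume BUFFER2. The Nmod6 tail is then
+ * processed 2 columns at a time (.L2_xx) and finally 1 column (.L1_xx).
+ * Within each sweep M is tiled as 16/8/4/2/1 rows, the accumulators are
+ * built with vfmaddps/vfmaddss over K, and the write-back computes
+ *
+ *     C_tile = alpha * acc + C_tile      (see .L6_19, .L2_19, .L1_19)
+ */
+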
+ cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // 
aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + 
KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + 
KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups 
%xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , 
(CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
+.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, 
SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + 
vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je 
.L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + 
ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + 
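+
+/*
+ * TRMM bookkeeping for the 2-column sweep starting at .L2_11: in this build
+ * the column loop advances 2 columns at a time (the divisor above is 2, so
+ * Ndiv6/Nmod6 actually hold N/2 and N%2 despite the register names). KK is
+ * initialized from OFFSET and tracks the current position along K; KKK holds
+ * the effective inner-loop length for each tile, and the LEFT/TRANSA
+ * conditionals before and after each kernel advance AO/BO past the region a
+ * triangular operand does not touch. Alpha is applied with vmulps/vmulss
+ * (C is not read back), since TRMM overwrites its output rather than
+ * accumulating into it.
+ */
+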
+.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + 
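+
+/*
+ * Reference sketch of the 16x2 tile above (illustrative only; A and B denote
+ * the packed AO/BUFFER1 tiles, column 0 of the accumulator lives in
+ * xmm4/xmm7/xmm10/xmm13 and column 1 in xmm5/xmm8/xmm11/xmm14):
+ *
+ *   for (k = 0; k < kc; k++)            // kc = K, or KKK for TRMM
+ *       for (i = 0; i < 16; i++) {
+ *           acc0[i] += A[16 * k + i] * B[2 * k + 0];
+ *           acc1[i] += A[16 * k + i] * B[2 * k + 1];
+ *       }
+ *   for (i = 0; i < 16; i++) {          // write-back, see .L2_19
+ *       C[i]       = alpha * acc0[i] + (TRMM ? 0.0f : C[i]);
+ *       C[i + ldc] = alpha * acc1[i] + (TRMM ? 0.0f : C[i + ldc]);
+ *   }
+ */
+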
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, 
%rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss %xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + 
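The loop set-up used above (advance AO/BO past the panel, then `negq BI` / `negq %rax`) lets each iteration advance with a single add per counter and terminate on a `jl` test against zero. A small C sketch of the same counting idiom, with hypothetical names:

static float dot_negative_index(const float *a, const float *b, long k)
{
    const float *aend = a + k, *bend = b + k;  /* pointers moved past the data */
    float acc = 0.0f;
    for (long i = -k; i < 0; i++)              /* "addq $1, ...; jl" pattern   */
        acc += aend[i] * bend[i];
    return acc;
}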
KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 
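The single-column tiles follow the same trip-count split as the wider ones: an 8-way unrolled main loop selected by `andq $-8` and a scalar remainder selected by `andq $7` (.L1_17 / .L1_20_7 above). An illustrative C reference of what one m-by-1 tile accumulates, with hypothetical names and mr standing for 16, 8, 4, 2 or 1:

static void sgemm_mx1_ref(long k, long mr, const float *a, const float *b,
                          float *acc)
{
    long k8 = k & ~7L, i = 0;
    for (; i < k8; i += 8)                  /* .L1_12 / .L1_20_2 body */
        for (long u = 0; u < 8; u++)
            for (long j = 0; j < mr; j++)
                acc[j] += a[mr*(i+u) + j] * b[i+u];
    for (; i < k; i++)                      /* .L1_17 / .L1_20_7 tail */
        for (long j = 0; j < mr; j++)
            acc[j] += a[mr*i + j] * b[i];
}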
+ ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + 
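Complementing the effective-K sketch earlier, the conditional blocks that follow each store above rewind A and B past the part of K the tile skipped and, when the triangular factor is on the left, grow KK by the row-block size. A hedged C sketch with hypothetical names (mr/nr as before):

static void trmm_advance_after_tile(long K, long KKK, long *KK,
                                    const float **ao, const float **bo,
                                    long mr, long nr,
                                    int advance_case, int left)
{
    if (advance_case) {            /* (LEFT && TRANSA) || (!LEFT && !TRANSA) */
        long skipped = K - KKK;    /* "movq K, %rax; subq KKK, %rax"         */
        *bo += skipped * nr;       /* "leaq (BO, BI, SIZE), BO"              */
        *ao += skipped * mr;       /* "leaq (AO, %rax, SIZE), AO"            */
    }
    if (left)
        *KK += mr;                 /* "addq $mr, KK"                         */
}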
jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index ef156fd27..76ea12fee 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -1,6806 +1,6806 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/********************************************************************* -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* SGEMM_DEFAULT_UNROLL_N 4 -* SGEMM_DEFAULT_UNROLL_M 16 -* SGEMM_DEFAULT_P 768 -* SGEMM_DEFAULT_Q 384 -* A_PR1 512 -* B_PR1 512 -* -* -* 2014/07/28 Saar -* Performance at 9216x9216x9216: -* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) -* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) -* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) -* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) -* -*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define BO2 %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#if defined(OS_WINDOWS) -#define L_BUFFER_SIZE 8192 -#else -#define L_BUFFER_SIZE 12288 -#endif - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 - -#else - -#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 6 lines of N -*******************************************************************************************/ - -.macro KERNEL16x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vmovups -8 * SIZE(AO), %ymm1 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - prefetcht0 A_PR1(AO) - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) - 
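The VFMADD231PS_ / VFMADD231SS_ wrappers defined above emit the FMA4 form (vfmaddps) on BULLDOZER and the FMA3 form (vfmadd231ps) otherwise, so the same macros serve both instruction sets. Each pass of KERNEL16x6_SUB is one rank-1 update of a 16x6 register tile: two 8-float loads of A and six broadcast B values feeding twelve ymm accumulators. A plain-C sketch of that single step (hypothetical helper, not the kernel interface):

static void kernel16x6_step(const float *ao, const float *bo, float acc[6][16])
{
    for (int n = 0; n < 6; n++)           /* vbroadcastss of bo[n]           */
        for (int m = 0; m < 16; m++)      /* two vmovups halves: ymm0, ymm1  */
            acc[n][m] += ao[m] * bo[n];   /* VFMADD231PS_ into ymm4..ymm15   */
}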
- addq $ 6*SIZE, BO - addq $ 16*SIZE, AO - decq %rax -.endm - -.macro SAVE16x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm13, %ymm13 - vmulps %ymm0 , %ymm14, %ymm14 - vmulps %ymm0 , %ymm15, %ymm15 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 - - vaddps (CO2), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2), %ymm11,%ymm11 - - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 - - vaddps (CO2, LDC,2), %ymm14,%ymm14 - vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) - - vmovups %ymm10, (CO2) - vmovups %ymm11, 8 * SIZE(CO2) - - vmovups %ymm12, (CO2, LDC) - vmovups %ymm13, 8 * SIZE(CO2, LDC) - - vmovups %ymm14, (CO2, LDC,2) - vmovups %ymm15, 8 * SIZE(CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE8x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm14, %ymm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps (CO2), %ymm10,%ymm10 - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps (CO2, LDC,2), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm10, (CO2) - vmovups %ymm12, (CO2, LDC) - vmovups %ymm14, (CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x6_SUB - vmovups -16 * SIZE(AO), %xmm0 - vbroadcastss -4 * SIZE(BO), %xmm2 - vbroadcastss -3 * SIZE(BO), %xmm3 - - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - - vbroadcastss -2 * SIZE(BO), %xmm2 - vbroadcastss -1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - - vbroadcastss 0 * SIZE(BO), %xmm2 - vbroadcastss 1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE4x6 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - vmulps %xmm0 , 
%xmm12, %xmm12 - vmulps %xmm0 , %xmm14, %xmm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO1, LDC,2), %xmm8,%xmm8 - vaddps (CO2), %xmm10,%xmm10 - vaddps (CO2, LDC), %xmm12,%xmm12 - vaddps (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO1, LDC,2) - vmovups %xmm10, (CO2) - vmovups %xmm12, (CO2, LDC) - vmovups %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -15 * SIZE(AO), %xmm1 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE2x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm13, %xmm13 - vmulss %xmm0 , %xmm14, %xmm14 - vmulss %xmm0 , %xmm15, %xmm15 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 - - vaddss (CO2), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2), %xmm11,%xmm11 - - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 - - vaddss (CO2, LDC,2), %xmm14,%xmm14 - vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) - - vmovss %xmm10, (CO2) - vmovss %xmm11, 1 * SIZE(CO2) - - vmovss %xmm12, (CO2, LDC) - vmovss %xmm13, 1 * SIZE(CO2, LDC) - - vmovss %xmm14, (CO2, LDC,2) - vmovss %xmm15, 1 * SIZE(CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 1*SIZE, AO - decq %rax -.endm - -.macro SAVE1x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm14, %xmm14 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 
- vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss (CO2), %xmm10,%xmm10 - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm10, (CO2) - vmovss %xmm12, (CO2, LDC) - vmovss %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( 
%xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N 
-*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - 
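All of the SAVEmxn macros above and below share one pattern: scale the accumulators by ALPHA, add the existing C tile unless TRMMKERNEL is defined, and store. A minimal C sketch of that epilogue for an mr-by-nr tile (hypothetical names; ldc is in elements):

static void save_tile(float *c, long ldc, const float *acc,
                      int mr, int nr, float alpha, int trmm)
{
    for (int n = 0; n < nr; n++)
        for (int m = 0; m < mr; m++) {
            float v = alpha * acc[n*mr + m];            /* vmulps/vmulss by ALPHA */
            c[n*ldc + m] = trmm ? v : c[n*ldc + m] + v; /* vaddps only in GEMM    */
        }
}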
addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - -/************************************************************************************* -* GEMM Kernel -*************************************************************************************/ - - - PROLOGUE - 
PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv6 // N / 12 - movq %rdx, Nmod6 // N % 12 - - movq Ndiv6, J - cmpq $0, J - je .L4_00 - ALIGN_4 - - -/*******************************************************************************************/ - -.L6_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq BO2, B // next offset of B - movq K, %rax - - ALIGN_4 - - -.L6_02c: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 4*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_02c - - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - - ALIGN_4 - -.L6_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x6_SUB - - jnz .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L6_60 // to next 6 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - 
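The .L6_02c copy loop above repacks B for the 6-column pass: BUFFER1 lives in the page-aligned scratch area carved out of the stack in the prologue, and each k step takes four floats from the current 4-wide packed panel (BO1) and two from the next panel (BO2, set to B plus 4*K floats), writing a contiguous 6-float row; the second pass (.L7_02c below) mirrors this with a 2+4 split. A hedged C sketch of the first copy, with hypothetical names:

static void pack_b_6wide(long k, const float *b0, const float *b1, float *buf)
{
    /* b1 is assumed to point 4*k floats past b0, as the leaq before
     * .L6_02c sets BO2 = B + K*4 values. */
    for (long i = 0; i < k; i++) {
        for (int j = 0; j < 4; j++) buf[6*i + j]     = b0[4*i + j];  /* vmovups */
        for (int j = 0; j < 2; j++) buf[6*i + 4 + j] = b1[4*i + j];  /* vmovsd  */
    }
}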
KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - ALIGN_4 - -.L6_20_7: - - KERNEL8x6_SUB - - jnz .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x6_SUB - - jnz .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - - ALIGN_4 - -.L6_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x6_SUB - - jnz .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L6_60 // to next 4 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - - ALIGN_4 - -.L6_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x6_SUB - - jnz .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L6_60: - - -/*******************************************************************************************/ - - -.L7_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq K, %rax - - ALIGN_4 
- - -.L7_02c: - - vmovsd 2*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_02c - - movq BO2, B // next offset of B - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - - ALIGN_4 - -.L7_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_4 - -.L7_17: - - KERNEL16x6_SUB - - jnz .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 6 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x6_SUB - - jnz .L7_20_7 - ALIGN_4 - - -.L7_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - -.L7_27: - - KERNEL4x6_SUB - - jnz .L7_27 - ALIGN_4 - - -.L7_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 
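Taken together, the column blocking runs two 6-wide passes per block of 12 columns (Ndiv6 holds N/12), then works through the N % 12 remainder with 4-, 2- and 1-column tiles, while each pass tiles the rows 16/8/4/2/1. A small C sketch that simply counts the resulting micro-kernel tiles, under those assumptions:

static long count_micro_tiles(long m, long n)
{
    long rows = (m >> 4) + !!(m & 8) + !!(m & 4) + !!(m & 2) + !!(m & 1);
    long rem  = n % 12;                /* Nmod6                         */
    long cols = 2 * (n / 12)           /* .L6_xx and .L7_xx passes      */
              + rem / 4                /* .L4_xx: 4-column tiles        */
              + !!(rem & 2)            /* .L2_xx: 2-column tile         */
              + !!(rem & 1);           /* .L1_xx: 1-column tile         */
    return rows * cols;
}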
- - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x6_SUB - - jnz .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 4 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - - ALIGN_4 - -.L7_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x6_SUB - - jnz .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L7_60: - - decq J // j -- - jg .L6_01 // next 12 lines of N - - - - -/*******************************************************************************************/ -.L4_00: - - movq Nmod6, J - sarq $2, J // j = j / 4 - cmpq $ 0, J - je .L2_00 - ALIGN_4 - - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = 
BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_00: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - 
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - 
negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // 
Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - 
KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if 
!defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax 
-#else
-	movq KKK, %rax
-#endif
-
-	andq $7, %rax # if (k & 1)
-	je .L1_39
-
-	movq %rax, BI // Index for BO
-
-	salq $1, %rax // rax = rax *2 ; number of values
-	leaq (AO, %rax, SIZE), AO
-	leaq (BO, BI, SIZE), BO
-	negq BI
-	negq %rax
-	ALIGN_4
-
-.L1_37:
-
-	KERNEL2x1_SUB
-
-	jl .L1_37
-	ALIGN_4
-
-
-.L1_39:
-
-	SAVE2x1
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq K, %rax
-	subq KKK, %rax
-	movq %rax, BI // Index for BO
-	leaq (BO, BI, SIZE), BO
-	salq $1, %rax // rax = rax * 2 ; number of values
-	leaq (AO, %rax, SIZE), AO
-#endif
-
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq $2, KK
-#endif
-
-	addq $2 * SIZE, CO1 # coffset += 2
-	ALIGN_4
-
-.L1_40:
-	testq $1, M
-	jz .L999
-
-	ALIGN_4
-
-.L1_41:
-#if !defined(TRMMKERNEL) || \
-    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	leaq BUFFER1, BO // first buffer to BO
-	addq $4 * SIZE, BO
-#else
-	movq KK, %rax
-	leaq BUFFER1, BO // first buffer to BO
-	addq $4 * SIZE, BO
-	movq %rax, BI // Index for BO
-	leaq (BO, BI, SIZE), BO
-	leaq (AO, %rax, SIZE), AO
-#endif
-
-
-	vzeroall
-
-#ifndef TRMMKERNEL
-	movq K, %rax
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-	movq K, %rax
-	subq KK, %rax
-	movq %rax, KKK
-#else
-	movq KK, %rax
-#ifdef LEFT
-	addq $1, %rax // number of values in AO
-#else
-	addq $1, %rax // number of values in BO
-#endif
-	movq %rax, KKK
-#endif
-
-	andq $-8, %rax
-	je .L1_46
-	movq %rax, BI // Index for BO
-
-	leaq (AO, %rax, SIZE), AO
-	leaq (BO, BI, SIZE), BO
-	negq BI
-	negq %rax
-	ALIGN_4
-
-.L1_42:
-
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-
-	je .L1_46
-
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-	KERNEL1x1_SUB
-
-	je .L1_46
-
-	jmp .L1_42
-	ALIGN_4
-
-.L1_46:
-#ifndef TRMMKERNEL
-	movq K, %rax
-#else
-	movq KKK, %rax
-#endif
-
-	andq $7, %rax # if (k & 1)
-	je .L1_49
-
-	movq %rax, BI // Index for BO
-
-	leaq (AO, %rax, SIZE), AO
-	leaq (BO, BI, SIZE), BO
-	negq BI
-	negq %rax
-	ALIGN_4
-
-.L1_47:
-
-	KERNEL1x1_SUB
-
-	jl .L1_47
-	ALIGN_4
-
-
-.L1_49:
-
-	SAVE1x1
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
-    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
-	movq K, %rax
-	subq KKK, %rax
-	movq %rax, BI // Index for BO
-	leaq (BO, BI, SIZE), BO
-	leaq (AO, %rax, SIZE), AO
-#endif
-
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-	addq $1, KK
-#endif
-
-	addq $1 * SIZE, CO1 # coffset += 1
-	ALIGN_4
-
-
-.L999:
-	movq SP, %rsp
-	movq (%rsp), %rbx
-	movq 8(%rsp), %rbp
-	movq 16(%rsp), %r12
-	movq 24(%rsp), %r13
-	movq 32(%rsp), %r14
-	movq 40(%rsp), %r15
-
-#ifdef WINDOWS_ABI
-	movq 48(%rsp), %rdi
-	movq 56(%rsp), %rsi
-	movups 64(%rsp), %xmm6
-	movups 80(%rsp), %xmm7
-	movups 96(%rsp), %xmm8
-	movups 112(%rsp), %xmm9
-	movups 128(%rsp), %xmm10
-	movups 144(%rsp), %xmm11
-	movups 160(%rsp), %xmm12
-	movups 176(%rsp), %xmm13
-	movups 192(%rsp), %xmm14
-	movups 208(%rsp), %xmm15
-#endif
-
-	addq $STACKSIZE, %rsp
-	ret
-
-	EPILOGUE
-
-
-
-#else
-
-/*************************************************************************************
-* TRMM Kernel
-*************************************************************************************/
-
-
-	PROLOGUE
-	PROFCODE
-
-	subq $STACKSIZE, %rsp
-	movq %rbx, (%rsp)
-	movq
%rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq %rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of 
values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq 
(BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 
4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - 
addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax 
// number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, 
%rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB 
- - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#endif - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + 
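Editorial note (not part of the patch): the VFMADD231PS_/VFMADD231SS_ wrappers defined just above hide the difference between the 4-operand FMA form selected for BULLDOZER (vfmaddps/vfmaddss) and the 3-operand vfmadd231ps/vfmadd231ss form used otherwise; in both encodings the first macro argument is the accumulator register. The C sketch below is ours and only spells out what one packed invocation accumulates per lane, under the assumption of 8 lanes for a ymm register (4 for xmm).

```c
/* Illustrative sketch, not part of the patch: the per-lane effect of one
 * VFMADD231PS_( acc, a, b ) invocation.  Function and parameter names are
 * hypothetical. */
#include <stddef.h>

/* acc[i] += a[i] * b_broadcast  for each packed float lane */
static void fma_accumulate(float *acc, const float *a, float b_broadcast, size_t lanes)
{
    for (size_t i = 0; i < lanes; i++)      /* lanes = 8 (ymm) or 4 (xmm) */
        acc[i] += a[i] * b_broadcast;
}
```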
+/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(AO), %ymm1 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm13, %ymm13 + vmulps %ymm0 , %ymm14, %ymm14 + vmulps %ymm0 , %ymm15, %ymm15 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 + + vaddps (CO2), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2), %ymm11,%ymm11 + + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 + + vaddps (CO2, LDC,2), %ymm14,%ymm14 + vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) + + vmovups %ymm10, (CO2) + vmovups %ymm11, 8 * SIZE(CO2) + + vmovups %ymm12, (CO2, LDC) + vmovups %ymm13, 8 * SIZE(CO2, LDC) + + vmovups %ymm14, (CO2, LDC,2) + vmovups %ymm15, 8 * SIZE(CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, 
LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss 
%xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + 
VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + 
VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( 
%xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * 
SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + 
+ andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + 
prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + 
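Editorial note (not part of the patch): throughout the 6-column passes (.L6_* and .L7_*), the SAVE*x6 macros write C through two column bases set up in .L6_10/.L7_10: CO1 covers columns 0-2 via LDC addressing, and CO2 = C + 3*LDC covers columns 3-5. The sketch below is our own illustration of that addressing, assuming ldc counts elements (the assembly pre-scales LDC by the element size).

```c
/* Illustrative sketch, not from the patch: column pointers touched by one
 * 6-wide C panel.  Names are ours; ldc is in elements. */
static void c_column_pointers(float *c, long ldc, float *cols[6])
{
    float *co1 = c;            /* CO1: first column of the panel          */
    float *co2 = c + 3 * ldc;  /* CO2: set to c + 3*ldc in .L6_10/.L7_10  */

    cols[0] = co1;             /* (CO1)          */
    cols[1] = co1 + ldc;       /* (CO1, LDC)     */
    cols[2] = co1 + 2 * ldc;   /* (CO1, LDC, 2)  */
    cols[3] = co2;             /* (CO2)          */
    cols[4] = co2 + ldc;       /* (CO2, LDC)     */
    cols[5] = co2 + 2 * ldc;   /* (CO2, LDC, 2)  */
}
```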
+/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: 
+ + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + 
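Editorial note (not part of the patch): the 16x4 block that ends just above (the .L4_12/.L4_16 loops built from KERNEL16x4_SUB and finished by SAVE16x4) is, in effect, a rank-1-update accumulation over k with alpha applied once at store time. The C model below is our own sketch under the assumption of packed A and B panels and shows only the plain GEMM store path (the !TRMMKERNEL case); it is not the actual implementation.

```c
/* Illustrative C model (ours) of one 16x4 micro-tile: C[0:16,0:4] gets
 * alpha * A_panel * B_panel added to it.  ldc is in elements. */
static void microkernel_16x4(long K, float alpha,
                             const float *A,   /* packed: 16 floats per k */
                             const float *B,   /* packed:  4 floats per k */
                             float *C, long ldc)
{
    float acc[4][16] = {{0.0f}};

    for (long k = 0; k < K; k++)              /* KERNEL16x4_SUB, unrolled by 8 */
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 16; i++)
                acc[j][i] += A[16 * k + i] * B[4 * k + j];

    for (int j = 0; j < 4; j++)               /* SAVE16x4, non-TRMM path */
        for (int i = 0; i < 16; i++)
            C[j * ldc + i] += alpha * acc[j][i];
}
```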
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, 
SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 
2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + 
KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * 
SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + 
KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + 
movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, 
%rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + 
andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of 
values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + 
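The ".L1_20" block (like ".L2_20" and ".L4_20" before it) handles the rows left over after the 16-wide loop by testing individual bits of M and dispatching progressively narrower kernels. A rough C equivalent of that dispatch, with placeholder tile_* functions standing in for the KERNEL/SAVE pairs:

/* Rough C model of the "Rest of M" dispatch: floor(M/16) full 16-row tiles,
 * then 8/4/2/1-row kernels selected from the low bits of M. The tile_*
 * functions are placeholders, not part of the original source. */
void tile_16xN(void), tile_8xN(void), tile_4xN(void), tile_2xN(void), tile_1xN(void);

void process_rows(long M)
{
    for (long i = 0; i < (M >> 4); i++) tile_16xN(); /* sarq $4, I loop */
    if ((M & 15) == 0) return;                       /* testq $15, M    */
    if (M & 8) tile_8xN();                           /* testq $8, M     */
    if (M & 4) tile_4xN();                           /* testq $4, M     */
    if (M & 2) tile_2xN();                           /* testq $2, M     */
    if (M & 1) tile_1xN();                           /* testq $1, M     */
}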
+.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + 
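The preprocessor block that picks the loop count (K, KK or KKK) encodes the TRMM edge handling: for a plain GEMM build the full K is used, while for TRMMKERNEL the effective depth depends on LEFT/TRANSA and the running KK offset, plus the tile width being processed. A hedged C sketch of that selection (mr/nr, left/transa and kk mirror the build flags and registers; names are illustrative):

/* Effective trip count chosen by the #ifdef ladder above for one tile.
 * mr/nr are the current tile dimensions, kk the running OFFSET counter. */
long effective_k(int trmm, int left, int transa, long K, long kk, long mr, long nr)
{
    if (!trmm)
        return K;                         /* GEMM: movq K, %rax          */
    if ((left && !transa) || (!left && transa))
        return K - kk;                    /* subq KK, %rax               */
    return kk + (left ? mr : nr);         /* addq $mr / $nr, %rax        */
}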
movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + 
movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/sgemm_kernel_16x4_sandy.S b/kernel/x86_64/sgemm_kernel_16x4_sandy.S index ea15cd87e..2ee4b1554 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_sandy.S +++ b/kernel/x86_64/sgemm_kernel_16x4_sandy.S @@ -1,3167 +1,3167 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - vaddps %ymm14, %ymm6 , %ymm6 - vaddps %ymm15, %ymm7 , %ymm7 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm8 , %ymm8 - vaddps %ymm13, %ymm9 , %ymm9 - vaddps %ymm14, %ymm10, %ymm10 - vaddps %ymm15, %ymm11, %ymm11 - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 
64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm14, %ymm6 , %ymm6 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm8 , %ymm8 - vaddps %ymm14, %ymm10, %ymm10 - addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm4 , %xmm4 - vaddps %xmm14, %xmm6 , %xmm6 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm8 , %xmm8 - vaddps %xmm14, %xmm10, %xmm10 - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - vaddss %xmm14, %xmm6 , %xmm6 - vaddss %xmm15, %xmm7 , %xmm7 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm8 , %xmm8 - vaddss %xmm13, %xmm9 , %xmm9 - vaddss %xmm14, %xmm10, %xmm10 - vaddss %xmm15, %xmm11, %xmm11 - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), 
%xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm14, %xmm6 , %xmm6 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm8 , %xmm8 - vaddss %xmm14, %xmm10, %xmm10 - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - vaddps %ymm14, %ymm6 , %ymm6 - vaddps %ymm15, %ymm7 , %ymm7 - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm14, %ymm6 , %ymm6 - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) 
- -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm4 , %xmm4 - vaddps %xmm14, %xmm6 , %xmm6 - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - vaddss %xmm14, %xmm6 , %xmm6 - vaddss %xmm15, %xmm7 , %xmm7 - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm14, %xmm6 , %xmm6 - addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 
* SIZE(BO, BI, SIZE), %ymm2 - vmulps %ymm2 , %ymm0 , %ymm12 - vaddps %ymm12, %ymm4 , %ymm4 - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulps %xmm2 , %xmm0 , %xmm12 - vaddps %xmm12, %xmm4 , %xmm4 - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulss %xmm2 , %xmm0 , %xmm12 - vaddss %xmm12, %xmm4 , %xmm4 - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq 
%rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax 
-#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values 
in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - 
leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - 
- KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 
-*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // 
first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of 
values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO 
-#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + vaddps %ymm14, %ymm6 , %ymm6 + vaddps %ymm15, %ymm7 , %ymm7 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm8 , %ymm8 + vaddps %ymm13, %ymm9 , %ymm9 + vaddps %ymm14, %ymm10, %ymm10 + vaddps %ymm15, %ymm11, %ymm11 + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 
64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm14, %ymm6 , %ymm6 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm8 , %ymm8 + vaddps %ymm14, %ymm10, %ymm10 + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm4 , %xmm4 + vaddps %xmm14, %xmm6 , %xmm6 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm8 , %xmm8 + vaddps %xmm14, %xmm10, %xmm10 + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + vaddss %xmm14, %xmm6 , %xmm6 + vaddss %xmm15, %xmm7 , %xmm7 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm8 , %xmm8 + vaddss %xmm13, %xmm9 , %xmm9 + vaddss %xmm14, %xmm10, %xmm10 + vaddss %xmm15, %xmm11, %xmm11 + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), 
%xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm14, %xmm6 , %xmm6 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm8 , %xmm8 + vaddss %xmm14, %xmm10, %xmm10 + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + vaddps %ymm14, %ymm6 , %ymm6 + vaddps %ymm15, %ymm7 , %ymm7 + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm14, %ymm6 , %ymm6 + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) 
+ +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm4 , %xmm4 + vaddps %xmm14, %xmm6 , %xmm6 + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + vaddss %xmm14, %xmm6 , %xmm6 + vaddss %xmm15, %xmm7 , %xmm7 + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm14, %xmm6 , %xmm6 + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 
* SIZE(BO, BI, SIZE), %ymm2 + vmulps %ymm2 , %ymm0 , %ymm12 + vaddps %ymm12, %ymm4 , %ymm4 + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulps %xmm2 , %xmm0 , %xmm12 + vaddps %xmm12, %xmm4 , %xmm4 + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulss %xmm2 , %xmm0 , %xmm12 + vaddss %xmm12, %xmm4 , %xmm4 + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq 
%rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax 
+#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO 
+#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values 
in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + 
leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first 
buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + 
+ KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // 
first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO 
+#endif
+        movq    %rax, KKK
+#endif
+
+        andq    $-8, %rax
+        je      .L1_46
+        movq    %rax, BI                        //  Index for BO
+
+        leaq    (AO, %rax, SIZE), AO
+        leaq    (BO, BI, SIZE), BO
+        negq    BI
+        negq    %rax
+        ALIGN_4
+
+.L1_42:
+
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+
+        je      .L1_46
+
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+        KERNEL1x1_SUB
+
+        je      .L1_46
+
+        jmp     .L1_42
+        ALIGN_4
+
+.L1_46:
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#else
+        movq    KKK, %rax
+#endif
+
+        andq    $7, %rax                        # if (k & 1)
+        je      .L1_49
+
+        movq    %rax, BI                        //  Index for BO
+
+        leaq    (AO, %rax, SIZE), AO
+        leaq    (BO, BI, SIZE), BO
+        negq    BI
+        negq    %rax
+        ALIGN_4
+
+.L1_47:
+
+        KERNEL1x1_SUB
+
+        jl      .L1_47
+        ALIGN_4
+
+
+.L1_49:
+
+        SAVE1x1
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax
+        movq    %rax, BI                        //  Index for BO
+        leaq    (BO, BI, SIZE), BO
+        leaq    (AO, %rax, SIZE), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK
+#endif
+
+        addq    $1 * SIZE, CO1          # coffset += 1
+        ALIGN_4
+
+
+.L999:
+        movq    SP, %rsp
+        movq    (%rsp), %rbx
+        movq    8(%rsp), %rbp
+        movq    16(%rsp), %r12
+        movq    24(%rsp), %r13
+        movq    32(%rsp), %r14
+        movq    40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+        movq    48(%rsp), %rdi
+        movq    56(%rsp), %rsi
+        movups  64(%rsp), %xmm6
+        movups  80(%rsp), %xmm7
+        movups  96(%rsp), %xmm8
+        movups  112(%rsp), %xmm9
+        movups  128(%rsp), %xmm10
+        movups  144(%rsp), %xmm11
+        movups  160(%rsp), %xmm12
+        movups  176(%rsp), %xmm13
+        movups  192(%rsp), %xmm14
+        movups  208(%rsp), %xmm15
+#endif
+
+        addq    $STACKSIZE, %rsp
+        ret
+
+        EPILOGUE
+
+
+
+
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index 621ddc622..c9681fa8b 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -25,9 +25,12 @@
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
 #include "common.h"
 
+#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
+#pragma GCC optimize("no-tree-vectorize")
+#endif
+
 #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
 
 #include "sgemv_n_microk_bulldozer-4.c"
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
index 0be2c7e97..07aa51503 100644
--- a/kernel/x86_64/sgemv_t_4.c
+++ b/kernel/x86_64/sgemv_t_4.c
@@ -25,9 +25,12 @@
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
 #include "common.h"
 
+#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
+#pragma GCC optimize("no-tree-vectorize")
+#endif
+
 #if defined(NEHALEM)
 #include "sgemv_t_microk_nehalem-4.c"
 #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c
index 29d6a9958..45914daf5 100644
--- a/kernel/x86_64/ssymv_L.c
+++ b/kernel/x86_64/ssymv_L.c
@@ -25,9 +25,12 @@
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 02bbc1c64..26e5ca7e9 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_U_microk_bulldozer-2.c" diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c index 4e2cd4fe6..dbfcd55d7 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c @@ -1,279 +1,279 @@ -#include "common.h" -#include -#include "strsm_kernel_8x4_haswell_R_common.h" - -#define SOLVE_RN_m8n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1)\ - SOLVE_ri_m8n2(56,6,7,%1)\ - SAVE_SOLUTION_m8n2(6,7,64) - -#define SOLVE_RN_m8n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(6,7,64)\ - SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(8,9,128)\ - SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(10,11,192) - -#define SOLVE_RN_m8n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) 
SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(6,7,64)\ - SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(8,9,128)\ - SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(10,11,192)\ - SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(12,13,256)\ - SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(14,15,320) - -#define SOLVE_RN_m4n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1)\ - SOLVE_ri_m4n2(56,5,%1)\ - SAVE_SOLUTION_m4n2(5,32) - -#define SOLVE_RN_m4n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(5,32)\ - SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(6,64)\ - SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(7,96) - -#define SOLVE_RN_m4n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(5,32)\ - SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(6,64)\ - SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(7,96)\ - 
SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(8,128)\ - SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(9,160) - -#define SOLVE_RN_m2n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,0) - -#define SOLVE_RN_m2n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ - SAVE_SOLUTION_m2n4(4,5,0)\ - SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ - SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ - SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ - SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ - SAVE_SOLUTION_m2n4(6,7,32) - -#define SOLVE_RN_m2n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(4,5,0)\ - SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(6,7,32)\ - SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(8,9,64) - -#define SOLVE_RN_m1n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1)\ - SOLVE_col2_ltor_m1n4(16,4,%1)\ - SOLVE_col3_ltor_m1n4(32,4,%1)\ - SOLVE_col4_ltor_m1n4(48,4,%1)\ - SAVE_SOLUTION_m1n4(4,0) - -#define SOLVE_RN_m1n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ - SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ - SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ - SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ - SAVE_SOLUTION_m1n4(4,0)\ - SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ - SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ - SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ - SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ - SAVE_SOLUTION_m1n4(5,16) - -#define SOLVE_RN_m1n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ - 
SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(4,0)\ - SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(5,16)\ - SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(6,32) - -#define GEMM_RN_SIMPLE(mdim,ndim) \ - "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ - "1"#mdim""#ndim"1:\n\t"\ - GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ - "1"#mdim""#ndim"2:\n\t" -#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) -#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) -#define GEMM_RN_m8n12 \ - "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ - "cmpq $8,%5; jb 18122f;"\ - "18121:\n\t"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ - "18122:\n\t"\ - "testq %5,%5; jz 18124f;"\ - "18123:\n\t"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ - "18124:\n\t" -#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) -#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) -#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) -#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) -#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) -#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) -#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) -#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) -#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) - -#define COMPUTE(ndim) {\ - __asm__ __volatile__(\ - "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ - "cmpq $8,%%r11; jb "#ndim"772f;"\ - #ndim"771:\n\t"\ - GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ - #ndim"772:\n\t"\ - "testq $4,%%r11; jz "#ndim"773f;"\ - GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ - #ndim"773:\n\t"\ - "testq $2,%%r11; jz "#ndim"774f;"\ - GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ - #ndim"774:\n\t"\ - "testq $1,%%r11; jz "#ndim"775f;"\ - GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ - #ndim"775:\n\t"\ - "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ - :"r11","r12","r13","r14","r15","cc","memory",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ - a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ -} - -static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT a0, b0; - int i, j, k; - for (i=0; i7;m_count-=8){ - if(kk>0) 
GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); - a_ptr += k * 8; c_ptr += 8; - } - for(;m_count>3;m_count-=4){ - if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); - a_ptr += k * 4; c_ptr += 4; - } - for(;m_count>1;m_count-=2){ - if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); - a_ptr += k * 2; c_ptr += 2; - } - if(m_count>0){ - if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); - a_ptr += k * 1; c_ptr += 1; - } -} -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ - float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; - float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; - BLASLONG n_count = n; - for(;n_count>11;n_count-=12) COMPUTE(12) - for(;n_count>7;n_count-=8) COMPUTE(8) - for(;n_count>3;n_count-=4) COMPUTE(4) - for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} - if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); - return 0; -} +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RN_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1)\ + SOLVE_ri_m8n2(56,6,7,%1)\ + SAVE_SOLUTION_m8n2(6,7,64) + +#define SOLVE_RN_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(10,11,192) + +#define SOLVE_RN_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) 
SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(10,11,192)\ + SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(12,13,256)\ + SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(14,15,320) + +#define SOLVE_RN_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1)\ + SOLVE_ri_m4n2(56,5,%1)\ + SAVE_SOLUTION_m4n2(5,32) + +#define SOLVE_RN_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(7,96) + +#define SOLVE_RN_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ + 
SAVE_SOLUTION_m4n2(7,96)\ + SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(8,128)\ + SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(9,160) + +#define SOLVE_RN_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,0) + +#define SOLVE_RN_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(6,7,32) + +#define SOLVE_RN_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(6,7,32)\ + SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(8,9,64) + +#define SOLVE_RN_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1)\ + SOLVE_col2_ltor_m1n4(16,4,%1)\ + SOLVE_col3_ltor_m1n4(32,4,%1)\ + SOLVE_col4_ltor_m1n4(48,4,%1)\ + SAVE_SOLUTION_m1n4(4,0) + +#define SOLVE_RN_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(4,0)\ + SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(5,16) + +#define SOLVE_RN_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) 
SUBTRACT_m1n4(32,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(4,0)\ + SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(5,16)\ + SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(6,32) + +#define GEMM_RN_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) +#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) +#define GEMM_RN_m8n12 \ + "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) +#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) +#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) +#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) +#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) +#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) +#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) +#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) +#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ +} + +static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=0; i7;m_count-=8){ + if(kk>0) 
GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c index ffcbfbbf0..9de3354de 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c @@ -1,281 +1,281 @@ -#include "common.h" -#include -#include "strsm_kernel_8x4_haswell_R_common.h" - -#define SOLVE_RT_m8n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-128) - -#define SOLVE_RT_m8n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ - SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-112,4,5,%1)\ - SOLVE_le_m8n2(-128,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-256) - -#define SOLVE_RT_m8n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - 
SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ - SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ - SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ - SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ - SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ - SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-176,4,5,%1)\ - SOLVE_le_m8n2(-192,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-384) - -#define SOLVE_RT_m4n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(4,-64) - -#define SOLVE_RT_m4n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ - SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ - SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-112,4,%1)\ - SOLVE_le_m4n2(-128,4,%1)\ - SAVE_SOLUTION_m4n2(4,-128) - -#define SOLVE_RT_m4n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,8,%1,%%r12,8) 
SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ - SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ - SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ - SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ - SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ - SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ - SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-176,4,%1)\ - SOLVE_le_m4n2(-192,4,%1)\ - SAVE_SOLUTION_m4n2(4,-192) - -#define SOLVE_RT_m2n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-32) - -#define SOLVE_RT_m2n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-64) - -#define SOLVE_RT_m2n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ - SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-96) - -#define SOLVE_RT_m1n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,4,%1)\ - 
SAVE_SOLUTION_m1n4(4,-16) - -#define SOLVE_RT_m1n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-80,4,%1)\ - SOLVE_col3_rtol_m1n4(-96,4,%1)\ - SOLVE_col2_rtol_m1n4(-112,4,%1)\ - SOLVE_col1_rtol_m1n4(-128,4,%1)\ - SAVE_SOLUTION_m1n4(4,-32) - -#define SOLVE_RT_m1n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ - SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ - SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ - SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ - SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-144,4,%1)\ - SOLVE_col3_rtol_m1n4(-160,4,%1)\ - SOLVE_col2_rtol_m1n4(-176,4,%1)\ - SOLVE_col1_rtol_m1n4(-192,4,%1)\ - SAVE_SOLUTION_m1n4(4,-48) - -/* r14 = b_tail, r15 = a_tail, r13 = k-kk */ -#define GEMM_RT_SIMPLE(mdim,ndim) \ - "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ - "1"#mdim""#ndim"1:\n\t"\ - "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ - "1"#mdim""#ndim"2:\n\t" -#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) -#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) -#define GEMM_RT_m8n12 \ - "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ - "cmpq $8,%5; jb 18122f;"\ - "18121:\n\t"\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ - "18122:\n\t"\ - "testq %5,%5; jz 18124f;"\ - "18123:\n\t"\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ - "18124:\n\t" -#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) -#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) -#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) -#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) -#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) -#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) -#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) -#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) -#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) - -#define COMPUTE(ndim) {\ - b_ptr -= (ndim-4)*K; c_ptr -= 
(ndim-4)*K; c_ptr -= ndim * ldc;\
- __asm__ __volatile__(\
- "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\
- "cmpq $8,%%r11; jb "#ndim"772f;"\
- #ndim"771:\n\t"\
- GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
- #ndim"772:\n\t"\
- "testq $4,%%r11; jz "#ndim"773f;"\
- GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\
- #ndim"773:\n\t"\
- "testq $2,%%r11; jz "#ndim"774f;"\
- GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\
- #ndim"774:\n\t"\
- "testq $1,%%r11; jz "#ndim"775f;"\
- GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\
- #ndim"775:\n\t"\
- "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
- :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
- :"r11","r12","r13","r14","r15","cc","memory",\
- "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
- a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\
-}
-
-static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){
- FLOAT a0, b0;
- int i, j, k;
- for (i=n-1;i>=0;i--) {
- b0 = b[i*n+i];
- for (j=0;j7;m_count-=8){
- if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc);
- solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc);
- a_ptr += k * 8; c_ptr += 8;
- }
- for(;m_count>3;m_count-=4){
- if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc);
- solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc);
- a_ptr += k * 4; c_ptr += 4;
- }
- for(;m_count>1;m_count-=2){
- if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc);
- solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc);
- a_ptr += k * 2; c_ptr += 2;
- }
- if(m_count>0){
- if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc);
- solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc);
- a_ptr += k * 1; c_ptr += 1;
- }
-}
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
- float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C;
- float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
- uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0;
- BLASLONG n_count = n;
- if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;}
- if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;}
- for(;n_count>11;n_count-=12) COMPUTE(12)
- for(;n_count>7;n_count-=8) COMPUTE(8)
- for(;n_count>3;n_count-=4) COMPUTE(4)
- return 0;
-}
+#include "common.h"
+#include <stdint.h>
+#include "strsm_kernel_8x4_haswell_R_common.h"
+
+#define SOLVE_RT_m8n4 \
+ "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
+ SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
+ SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
+ SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
+ SOLVE_rile_m8n2(-48,4,5,%1)\
+ SOLVE_le_m8n2(-64,4,5,%1)\
+ SAVE_SOLUTION_m8n2(4,5,-128)
+
+#define SOLVE_RT_m8n8 \
+ "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
+ SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
+
SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-256) + +#define SOLVE_RT_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ + SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-176,4,5,%1)\ + SOLVE_le_m8n2(-192,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-384) + +#define SOLVE_RT_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(4,-64) + +#define SOLVE_RT_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,6,%1,%%r12,4) 
SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(4,-128) + +#define SOLVE_RT_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ + SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ + SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-176,4,%1)\ + SOLVE_le_m4n2(-192,4,%1)\ + SAVE_SOLUTION_m4n2(4,-192) + +#define SOLVE_RT_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-32) + +#define SOLVE_RT_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-64) + +#define SOLVE_RT_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) 
SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-96) + +#define SOLVE_RT_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(4,-16) + +#define SOLVE_RT_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(4,-32) + +#define SOLVE_RT_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-144,4,%1)\ + SOLVE_col3_rtol_m1n4(-160,4,%1)\ + SOLVE_col2_rtol_m1n4(-176,4,%1)\ + SOLVE_col1_rtol_m1n4(-192,4,%1)\ + SAVE_SOLUTION_m1n4(4,-48) + +/* r14 = b_tail, r15 = a_tail, r13 = k-kk */ +#define GEMM_RT_SIMPLE(mdim,ndim) \ + "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) +#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) +#define GEMM_RT_m8n12 \ + "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + 
"18121:\n\t"\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) +#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) +#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) +#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) +#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) +#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) +#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) +#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) +#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ +} + +static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ + FLOAT a0, b0; + int i, j, k; + for (i=n-1;i>=0;i--) { + b0 = b[i*n+i]; + for (j=0;j7;m_count-=8){ + if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; + BLASLONG 
n_count = n; + if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} + if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h index 36b7aa1a3..970d63578 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h @@ -1,226 +1,226 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ -/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ -/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ - -#define init_m8n4(c1,c2,c3,c4)\ - "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ - "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" -#define INIT_m8n4 init_m8n4(4,5,6,7) -#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) -#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) - -#define init_m4n4(c1,c2,c3,c4)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ - "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" -#define INIT_m4n4 init_m4n4(4,5,6,7) -#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) -#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) - -#define init_m2n4(c1,c2)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" -#define INIT_m2n4 init_m2n4(4,5) -#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) -#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) - -#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" -#define INIT_m1n4 init_m1n4(4) -#define INIT_m1n8 INIT_m1n4 init_m1n4(5) -#define INIT_m1n12 INIT_m1n8 init_m1n4(6) - -#define GEMM_KERNEL_k1m8n4 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ - "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ - "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" -#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" -#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" - -#define GEMM_KERNEL_k1m4n4 \ - "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ - "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ - "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ - "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ - "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" -#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; 
vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ - "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" - -#define GEMM_KERNEL_k1m2n4 \ - "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ - "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" -#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ - "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ - "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" - -#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" -#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" -#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" - -#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" - -#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ - "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" - -#define GEMM_SUM_REORDER_2x4(c1,c2)\ - "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ - "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ - "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ - -#define GEMM_SUM_REORDER_1x4(c1)\ - "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" - -#define SOLVE_le_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovsldup %%ymm"#c1",%%ymm1;" - -#define SOLVE_le_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" - -#define 
SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_ri_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovshdup %%ymm"#c1",%%ymm1;" - -#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" - -#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) 
SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ - "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ - "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ - "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m4n2(c1,a_off)\ - "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ - "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m1n4(c1,a_off)\ - "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ + "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ + "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; 
vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" + +#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ + "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2)\ + "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ + "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; 
vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define SOLVE_le_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_le_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_ri_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_ltor_m1n4(b_off,c1,...) 
SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m4n2(c1,a_off)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m1n4(c1,a_off)\ + "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index c52575d07..27397ccfa 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(BULLDOZER) #include "zdot_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S index 94e2f6117..6c8b4c872 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S +++ b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S @@ -1,1404 +1,1404 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I 
%xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), 
%xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define 
KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - 
-.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - 
vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - 
vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 
SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef 
LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 384 
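+/*
+ * Overview of the micro-kernel macros below (descriptive sketch only).
+ * The VFMADD_R / VFMADD_I selection above picks vfmaddpd or vfnmaddpd
+ * (FMA4, 4-operand) so that one set of KERNEL* macros covers all
+ * conjugation variants of ZGEMM.  In the plain (NN) case each pair of
+ * accumulators holds, schematically,
+ *     acc_r += [a_re, a_im] * b_re      (b_re broadcast by vmovddup)
+ *     acc_i += [a_re, a_im] * b_im      (b_im broadcast by vmovddup)
+ * and the real/imaginary parts are recombined with vshufpd/vaddsubpd in
+ * the store-back code further down.  A_PR1 (above) and B_PR1 (below) are
+ * the software-prefetch distances, in bytes, ahead of the A and B streams.
+ */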
+#define B_PR1 192 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 
* SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + 
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 
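+	/* Outer loop over the columns of C.  Ndiv6/Nmod6 (names kept from
+	   wider kernels) hold N/2 and N%2 here: each .L2_* pass copies two
+	   columns of B into the on-stack BUFFER1 and runs the 2x2 / 1x2
+	   micro kernels; a leftover single column is handled from .L1_0 on.
+	   In the store-back blocks (.L2_19, .L2_49, .L1_19, .L1_49) the
+	   "swap high and low 64 bytes" comments refer to the two 64-bit
+	   doubles of an xmm register: vshufpd $0x01 swaps them so that
+	   vaddsubpd can combine the partial products into
+	   (a_re*b_re - a_im*b_im, a_re*b_im + a_im*b_re) before the result
+	   is scaled by ALPHA_R/ALPHA_I and, unless TRMMKERNEL, added to C. */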
+ ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, 
%xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + 
KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq 
$2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 
144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S index 848b6f237..bffe5439d 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S +++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S @@ -1,1429 +1,1429 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/30 Saar -* -* Parameter: -* UNROLL_M 2 -* UNROLL_N 2 -* ZGEMM_P 384 -* ZGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) -* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) -* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) -* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) -* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) -* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) -* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) -* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) -* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 
-5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), 
%xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - 
vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax 
-#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO 
// first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE 
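[Editor's illustration] The VFMADD_R / VFMADD_I macros defined in this kernel pick vfmaddpd or vfnmaddpd per conjugation case (NN/NT/TN/TT, RN/RT/CN/CT, NR/NC/TR/TC, and the remaining R*/C* pair) so the real and imaginary partial products are accumulated with the correct signs. As a rough scalar sketch of that arithmetic only -- plain C, not OpenBLAS code; cmla, conj_a and conj_b are made-up names -- one complex multiply-accumulate with optional conjugation of either operand is:

    /* Scalar equivalent of one complex a*b accumulation; the kernel gets the
     * same sign pattern at register level via its VFMADD_R/VFMADD_I choices
     * plus the vaddsubpd combination in the store blocks. */
    #include <stdio.h>

    static void cmla(double ar, double ai, double br, double bi,
                     int conj_a, int conj_b, double *acc_r, double *acc_i)
    {
        if (conj_a) ai = -ai;            /* conjugate the A operand */
        if (conj_b) bi = -bi;            /* conjugate the B operand */
        *acc_r += ar * br - ai * bi;     /* real part of a*b */
        *acc_i += ar * bi + ai * br;     /* imaginary part of a*b */
    }

    int main(void)
    {
        double r = 0.0, i = 0.0;
        cmla(1.0, 2.0, 3.0, 4.0, 0, 0, &r, &i);   /* (1+2i)(3+4i) = -5 + 10i */
        printf("%g %+gi\n", r, i);
        return 0;
    }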
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/30 Saar +* +* Parameter: +* UNROLL_M 2 +* UNROLL_N 2 +* ZGEMM_P 384 +* ZGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) +* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) +* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) +* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) +* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) +* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) +* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) +* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) +* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + 
STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), 
%xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + 
prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq 
%rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + 
KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq 
$8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + 
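[Editor's illustration] In the .L2_19 / .L2_49 / .L1_19 / .L1_49 store blocks the accumulators are first combined with vshufpd and vaddsubpd, then scaled by ALPHA_R and ALPHA_I with two vmulpd and a final vaddsubpd, which together perform a vectorized complex multiply by alpha before the optional add of the existing C tile (skipped for TRMMKERNEL). A minimal scalar sketch of that alpha step, assuming a plain-C illustration with made-up names (zdouble, scale_by_alpha):

    /* c = alpha * acc, the scalar view of the vmulpd/vmulpd/vaddsubpd sequence:
     * xmm8 holds (re, im), xmm9 the swapped copy (im, re); after the two
     * multiplies, addsub yields (re*ar - im*ai, im*ar + re*ai). */
    #include <stdio.h>

    typedef struct { double re, im; } zdouble;

    static zdouble scale_by_alpha(zdouble acc, double alpha_r, double alpha_i)
    {
        zdouble out;
        out.re = acc.re * alpha_r - acc.im * alpha_i;
        out.im = acc.im * alpha_r + acc.re * alpha_i;
        return out;                       /* non-TRMM path then adds the C tile */
    }

    int main(void)
    {
        zdouble acc = { 3.0, 4.0 };       /* pretend accumulator from the K loop */
        zdouble c   = scale_by_alpha(acc, 2.0, 0.5);
        printf("%g %+gi\n", c.re, c.im);  /* (3+4i)*(2+0.5i) = 4 + 9.5i */
        return 0;
    }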
+ + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd 
%xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) 
|| defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index f91bfa89b..29729b101 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -1,3881 +1,3881 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/******************************************************************************** -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* ZGEMM_DEFAULT_UNROLL_N 2 -* ZGEMM_DEFAULT_UNROLL_M 4 -* ZGEMM_DEFAULT_P 256 -* ZGEMM_DEFAULT_Q 128 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/28 Saar -* Performance at 4608x4608x4608: -* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) -* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) -* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) -* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) -* -********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(BULLDOZER) - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#endif - -#else - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#endif - -#endif - -#define A_PR1 512 -#define B_PR1 512 - - - -/***************************************************************************************************/ - -.macro KERNEL4x3_SUB - vmovups (AO), %ymm0 - vmovups 4 * SIZE(AO), %ymm1 - prefetcht0 A_PR1(AO) - - vbroadcastsd (BO), %ymm2 - vbroadcastsd 1 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) - - vbroadcastsd 2 * SIZE(BO), %ymm2 - vbroadcastsd 3 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) - - vbroadcastsd 4 * SIZE(BO), %ymm2 - vbroadcastsd 5 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 - vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - vaddsubpd %ymm4 , %ymm5 ,%ymm5 - vaddsubpd %ymm6 , %ymm7 ,%ymm7 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - vmovapd %ymm5 , %ymm4 - vmovapd %ymm7 , %ymm6 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - vmulpd %ymm4 , %ymm0, %ymm4 - vmulpd %ymm6 , %ymm0, %ymm6 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - vmulpd %ymm5 , %ymm1, %ymm5 - vmulpd %ymm7 , %ymm1, %ymm7 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - - vaddpd (CO1, LDC,2), %ymm4 , %ymm4 - vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - 
- vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - vmovups %ymm4 , (CO1, LDC, 2) - vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - - - -/***************************************************************************************************/ - -.macro KERNEL2x3_SUB - vmovups (AO), %xmm0 - vmovups 2 * SIZE(AO), %xmm1 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE2x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - vaddsubpd %xmm4, %xmm5 ,%xmm5 - vaddsubpd %xmm6, %xmm7 ,%xmm7 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - vmovapd %xmm5, %xmm4 - vmovapd %xmm7, %xmm6 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - vmulpd %xmm4 , %xmm0, %xmm4 - vmulpd %xmm6 , %xmm0, %xmm6 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - vmulpd %xmm5 , %xmm1, %xmm5 - vmulpd %xmm7 , %xmm1, %xmm7 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - - vaddpd (CO1, LDC,2), %xmm4 , %xmm4 - vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * 
SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - - vmovups %xmm4 , (CO1, LDC,2) - vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) - -.endm - - -/************************************************************************************************/ - - -.macro KERNEL1x3_SUB - vmovups (AO), %xmm0 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE1x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - vaddsubpd %xmm4, %xmm5, %xmm5 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm5, %xmm4 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm4 , %xmm0, %xmm4 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm5 , %xmm1, %xmm5 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - -#ifndef TRMMKERNEL - - vaddpd (CO1) , %xmm8 , %xmm8 - vaddpd (CO1, LDC) , %xmm10, %xmm10 - vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm4 , (CO1, LDC,2) - -.endm - - - - -/***************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - - vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) - - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, 
%xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.macro KERNEL1x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 - vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13,%ymm12 , %ymm12 - - vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - -#else - vaddsubpd %ymm8, %ymm9 , %ymm9 - vaddsubpd %ymm12,%ymm13, %ymm13 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm13, %ymm12 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - 
vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm13, %ymm1, %ymm13 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13, %ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 ,4 * SIZE(CO1) - -.endm - - - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - - -/************************************************************************************************/ - - - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 
128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -/************************************************************************************************/ -.L6_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L2_00_0 - ALIGN_4 - - - -.L6_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq BO2, B // next offset of B - movq K, %rax - ALIGN_4 - -.L6_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups (BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_00_02b - -.L6_00_02c: - - - -.L6_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L6_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L6_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_4_16 - ALIGN_4 - -.L6_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - jmp .L6_4_12 - ALIGN_4 - -.L6_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_4_19 - ALIGN_4 - -.L6_4_17: - - KERNEL4x3_SUB - - jnz .L6_4_17 - ALIGN_4 - - -.L6_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L6_2_10: - testq $ 2, M - jz .L6_2_40 // to next 2 lines of N - -.L6_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_16 - ALIGN_4 - -.L6_2_12: - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - jmp .L6_2_12 - 
ALIGN_4 - -.L6_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_19 - ALIGN_4 - -.L6_2_17: - - KERNEL2x3_SUB - - jnz .L6_2_17 - ALIGN_4 - - -.L6_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_2_40: - testq $ 1, M - jz .L6_2_60 // to next 2 lines of N - - ALIGN_4 - -.L6_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_46 - - ALIGN_4 - -.L6_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - jmp .L6_2_42 - ALIGN_4 - -.L6_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_49 - - ALIGN_4 - -.L6_2_47: - - KERNEL1x3_SUB - - jnz .L6_2_47 - ALIGN_4 - - -.L6_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L6_2_41 - ALIGN_4 - - - - -.L6_2_60: - - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.L7_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq K, %rax - ALIGN_4 - -.L7_00_02b: - - vmovups 2 * SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_00_02b - -.L7_00_02c: - - movq BO2, B // next offset of B - - -.L7_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L7_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L7_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_4_16 - ALIGN_4 - -.L7_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - jmp .L7_4_12 - ALIGN_4 - -.L7_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_4_19 - - ALIGN_4 - -.L7_4_17: - - KERNEL4x3_SUB - - jnz .L7_4_17 - ALIGN_4 - - -.L7_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L7_2_10: - testq $ 2, M - jz .L7_2_40 // to next 2 lines of N - -.L7_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_16 - ALIGN_4 - -.L7_2_12: - - 
KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - jmp .L7_2_12 - ALIGN_4 - -.L7_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_19 - - ALIGN_4 - -.L7_2_17: - - KERNEL2x3_SUB - - jnz .L7_2_17 - ALIGN_4 - - -.L7_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_2_40: - testq $ 1, M - jz .L7_2_60 // to next 2 lines of N - - ALIGN_4 - -.L7_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_46 - - ALIGN_4 - -.L7_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - jmp .L7_2_42 - ALIGN_4 - -.L7_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_49 - ALIGN_4 - -.L7_2_47: - - KERNEL1x3_SUB - - jnz .L7_2_47 - ALIGN_4 - - -.L7_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L7_2_41 - ALIGN_4 - - - - -.L7_2_60: - - decq J // j -- - jg .L6_00_01 // next 6 lines of N - -/************************************************************************************************/ - - - -/************************************************************************************************/ -.L2_00_0: - - movq Nmod6, J - sarq $1, J // j = j / 2 - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = 
K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - 
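The preprocessor ladder just above selects the trip count of the inner K loop: for plain GEMM it is simply K, while for TRMM it becomes either K - KK or KK plus the micro-tile dimension, depending on the LEFT/TRANSA combination, and the result is kept in KKK. A plain-C restatement of that choice follows; the function and parameter names are illustrative, not OpenBLAS identifiers, and mr/nr stand for the micro-tile sizes (2 and 2 at this point in the file).

/* Trip count of the K loop for one micro-tile, mirroring the KKK setup above. */
static long effective_k(long k, long kk, long mr, long nr,
                        int trmm, int left, int transa)
{
    if (!trmm)
        return k;                          /* GEMM: always the full K         */
    if ((left && !transa) || (!left && transa))
        return k - kk;                     /* movq K,%rax ; subq KK,%rax      */
    return kk + (left ? mr : nr);          /* movq KK,%rax ; addq $mr (or nr) */
}

After each tile is stored, KK is advanced by the tile height (the addq $ 2, KK guarded by defined(LEFT) further down), so the next tile along the diagonal sees the updated offset.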
- andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - 
KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, 
KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - 
movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************************ - TRMM Kernel -************************************************************************************************/ - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - 
addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO 
// first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax 
- ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL 
- movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - -#endif - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/******************************************************************************** +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* ZGEMM_DEFAULT_UNROLL_N 2 +* ZGEMM_DEFAULT_UNROLL_M 4 +* ZGEMM_DEFAULT_P 256 +* ZGEMM_DEFAULT_Q 128 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/28 Saar +* Performance at 4608x4608x4608: +* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) +* +********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#endif + +#endif + +#define A_PR1 512 +#define B_PR1 512 + + + +/***************************************************************************************************/ + +.macro KERNEL4x3_SUB + vmovups (AO), %ymm0 + vmovups 4 * SIZE(AO), %ymm1 + prefetcht0 A_PR1(AO) + + vbroadcastsd (BO), %ymm2 + vbroadcastsd 1 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) + + vbroadcastsd 2 * SIZE(BO), %ymm2 + vbroadcastsd 3 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) + + vbroadcastsd 4 * SIZE(BO), %ymm2 + vbroadcastsd 5 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 + vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + vaddsubpd %ymm4 , %ymm5 ,%ymm5 + vaddsubpd %ymm6 , %ymm7 ,%ymm7 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + vmovapd %ymm5 , %ymm4 + vmovapd %ymm7 , %ymm6 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + vmulpd %ymm4 , %ymm0, %ymm4 + vmulpd %ymm6 , %ymm0, %ymm6 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + vmulpd %ymm5 , %ymm1, %ymm5 + vmulpd %ymm7 , %ymm1, %ymm7 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + + vaddpd (CO1, LDC,2), %ymm4 , %ymm4 + vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + 
+ vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + vmovups %ymm4 , (CO1, LDC, 2) + vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + + + +/***************************************************************************************************/ + +.macro KERNEL2x3_SUB + vmovups (AO), %xmm0 + vmovups 2 * SIZE(AO), %xmm1 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE2x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + vaddsubpd %xmm4, %xmm5 ,%xmm5 + vaddsubpd %xmm6, %xmm7 ,%xmm7 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + vmovapd %xmm5, %xmm4 + vmovapd %xmm7, %xmm6 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + vmulpd %xmm4 , %xmm0, %xmm4 + vmulpd %xmm6 , %xmm0, %xmm6 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + vmulpd %xmm5 , %xmm1, %xmm5 + vmulpd %xmm7 , %xmm1, %xmm7 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + + vaddpd (CO1, LDC,2), %xmm4 , %xmm4 + vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * 
SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + + vmovups %xmm4 , (CO1, LDC,2) + vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) + +.endm + + +/************************************************************************************************/ + + +.macro KERNEL1x3_SUB + vmovups (AO), %xmm0 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE1x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + vaddsubpd %xmm4, %xmm5, %xmm5 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm5 , %xmm1, %xmm5 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + +#ifndef TRMMKERNEL + + vaddpd (CO1) , %xmm8 , %xmm8 + vaddpd (CO1, LDC) , %xmm10, %xmm10 + vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm4 , (CO1, LDC,2) + +.endm + + + + +/***************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + + vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) + + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, 
%xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.macro KERNEL1x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 + vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13,%ymm12 , %ymm12 + + vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + +#else + vaddsubpd %ymm8, %ymm9 , %ymm9 + vaddsubpd %ymm12,%ymm13, %ymm13 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm13, %ymm12 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + 
vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm13, %ymm1, %ymm13 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13, %ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 ,4 * SIZE(CO1) + +.endm + + + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + + +/************************************************************************************************/ + + + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 
128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +/************************************************************************************************/ +.L6_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00_0 + ALIGN_4 + + + +.L6_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq BO2, B // next offset of B + movq K, %rax + ALIGN_4 + +.L6_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups (BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_00_02b + +.L6_00_02c: + + + +.L6_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L6_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + ALIGN_4 + +.L6_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L6_2_10: + testq $ 2, M + jz .L6_2_40 // to next 2 lines of N + +.L6_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_16 + ALIGN_4 + +.L6_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + jmp .L6_2_12 + 
ALIGN_4 + +.L6_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_19 + ALIGN_4 + +.L6_2_17: + + KERNEL2x3_SUB + + jnz .L6_2_17 + ALIGN_4 + + +.L6_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_2_40: + testq $ 1, M + jz .L6_2_60 // to next 2 lines of N + + ALIGN_4 + +.L6_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_46 + + ALIGN_4 + +.L6_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + jmp .L6_2_42 + ALIGN_4 + +.L6_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_49 + + ALIGN_4 + +.L6_2_47: + + KERNEL1x3_SUB + + jnz .L6_2_47 + ALIGN_4 + + +.L6_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_2_41 + ALIGN_4 + + + + +.L6_2_60: + + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.L7_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq K, %rax + ALIGN_4 + +.L7_00_02b: + + vmovups 2 * SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_00_02b + +.L7_00_02c: + + movq BO2, B // next offset of B + + +.L7_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L7_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + ALIGN_4 + +.L7_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L7_2_10: + testq $ 2, M + jz .L7_2_40 // to next 2 lines of N + +.L7_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_16 + ALIGN_4 + +.L7_2_12: + + 
KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + jmp .L7_2_12 + ALIGN_4 + +.L7_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_19 + + ALIGN_4 + +.L7_2_17: + + KERNEL2x3_SUB + + jnz .L7_2_17 + ALIGN_4 + + +.L7_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_2_40: + testq $ 1, M + jz .L7_2_60 // to next 2 lines of N + + ALIGN_4 + +.L7_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_46 + + ALIGN_4 + +.L7_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + jmp .L7_2_42 + ALIGN_4 + +.L7_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_49 + ALIGN_4 + +.L7_2_47: + + KERNEL1x3_SUB + + jnz .L7_2_47 + ALIGN_4 + + +.L7_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L7_2_41 + ALIGN_4 + + + + +.L7_2_60: + + decq J // j -- + jg .L6_00_01 // next 6 lines of N + +/************************************************************************************************/ + + + +/************************************************************************************************/ +.L2_00_0: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = 
K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + 
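Editor's note on the loop shape used here and in every analogous .L*_1x block of this kernel: andq $ -8, %rax rounds the inner dimension down to a multiple of 8 so the main loop can be unrolled eight-fold, and the andq $ 7, %rax section afterwards runs the K % 8 leftover iterations one at a time. The following is a rough, purely illustrative C sketch of that shape; kernel_step is a hypothetical stand-in for the KERNEL*x*_SUB macros and is not a function in this source.

#include <stddef.h>

typedef void (*microstep_fn)(size_t k);     /* stand-in for one KERNEL*x*_SUB */

static void k_loop(size_t K, microstep_fn kernel_step)
{
    size_t k  = 0;
    size_t K8 = K & ~(size_t)7;             /* andq $ -8, %rax : K - (K % 8) */

    for (; k < K8; k += 8) {                /* unrolled main loop (.L*_12)   */
        kernel_step(k + 0); kernel_step(k + 1);
        kernel_step(k + 2); kernel_step(k + 3);
        kernel_step(k + 4); kernel_step(k + 5);
        kernel_step(k + 6); kernel_step(k + 7);
    }
    for (; k < K; k++) {                    /* andq $ 7, %rax remainder      */
        kernel_step(k);                     /* tail loop (.L*_17)            */
    }
}

The branch targets .L*_16 and .L*_17 in the surrounding assembly correspond to the remainder test and the tail loop in this sketch.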
+ andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + 
KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, 
KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + 
movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, 
BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************************ + TRMM Kernel +************************************************************************************************/ + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + 
addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO 
// first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq 
BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
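Editor's note on the negq BI / negq %rax pair just above: AO and BO have already been advanced past the block that is about to be processed, and the indices are then negated so that addressing of the form (AO,%rax,SIZE) still walks forward through the data while the index counts up toward zero, letting the loop close on the index alone without a separate end-pointer compare. A simplified, hypothetical C sketch of the idea follows; the single stride and the accumulate step are placeholders, not the real KERNEL*x*_SUB arithmetic, and in the actual kernel A and B advance at different rates per step (hence the separate salq shifts for %rax and BI).

#include <stddef.h>

/* One step of the sketch; stands in for a KERNEL*x*_SUB macro. */
static double accumulate(double acc, double a, double b)
{
    return acc + a * b;
}

static double neg_index_loop(const double *a, const double *b, ptrdiff_t k)
{
    const double *a_end = a + k;        /* leaq (AO, %rax, SIZE), AO */
    const double *b_end = b + k;        /* leaq (BO, BI,  SIZE), BO  */
    ptrdiff_t i = -k;                   /* negq %rax / negq BI       */
    double acc = 0.0;

    while (i != 0) {                    /* loop ends when index hits 0 */
        acc = accumulate(acc, a_end[i], b_end[i]);
        i++;                            /* index counts up toward 0    */
    }
    return acc;
}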
+ ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL 
+ movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + +#endif + + diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 2d6866a78..8fc960610 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -25,10 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_n_microk_haswell-4.c" @@ -231,10 +232,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT xbuffer[8],*ybuffer; -#if 0 -printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y); -#endif - if ( m < 1 ) return(0); if ( n < 1 ) return(0); diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index c2791e0f3..63c8b11a4 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 3744c98bb..45e3531b8 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -25,10 +25,25 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. + */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "zscal_microk_skylakex-2.c" +#elif defined(HASWELL) || defined(ZEN) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c new file mode 100644 index 000000000..f9e05e333 --- /dev/null +++ b/kernel/x86_64/zscal_microk_skylakex-2.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
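Editor's note on the fp-contract pragmas added to zscal.c above: a fused multiply-add evaluates a*c - b*d with only one rounding on the second product, which changes results in complex arithmetic; the textbook symptom is an imaginary part that should cancel exactly but does not. The small standalone program below illustrates the effect the pragmas guard against; it is not part of the library, and the input values are chosen arbitrarily.

#include <math.h>
#include <stdio.h>

/* Imaginary part of (a + bi) * (a - bi); mathematically this is exactly 0. */
static double imag_of_z_conj_z(double a, double b)
{
    return a * (-b) + b * a;
}

int main(void)
{
    double a = 1.0 / 3.0, b = 0.1;

    /* With contraction disabled this prints 0. If the compiler fuses one of
     * the two products into an FMA, it instead prints the rounding error of
     * a*b, which is what the explicit fma() call below reproduces.          */
    printf("uncontracted: %.17g\n", imag_of_z_conj_z(a, b));
    printf("fused:        %.17g\n", fma(a, -b, b * a));
    return 0;
}

The new zscal_microk_skylakex-2.c kernel that follows implements the same alpha*x product with explicit permute and addsub intrinsics (or a sign-flipped da_i vector on AVX-512) rather than relying on compiler-generated complex arithmetic.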
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + /* _mm512_addsub_pd does not exist so we flip signs for odd elements of da_i */ + __m512d da_r = _mm512_set1_pd(alpha[0]); + __m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m512d x0 = _mm512_loadu_pd(&x[i + 0]); + __m512d x1 = _mm512_loadu_pd(&x[i + 8]); + __m512d y0 = _mm512_permute_pd(x0, 0x55); + __m512d y1 = _mm512_permute_pd(x1, 0x55); + _mm512_storeu_pd(&x[i + 0], _mm512_add_pd(da_r * x0, da_i * y0)); + _mm512_storeu_pd(&x[i + 8], _mm512_add_pd(da_r * x1, da_i * y1)); + } +#else + __m256d da_r = _mm256_set1_pd(alpha[0]); + __m256d da_i = _mm256_set1_pd(alpha[1]); + for (; i < n2; i += 16) { + __m256d x0 = _mm256_loadu_pd(&x[i + 0]); + __m256d x1 = _mm256_loadu_pd(&x[i + 4]); + __m256d x2 = _mm256_loadu_pd(&x[i + 8]); + __m256d x3 = _mm256_loadu_pd(&x[i + 12]); + __m256d y0 = _mm256_permute_pd(x0, 0x05); + __m256d y1 = _mm256_permute_pd(x1, 0x05); + __m256d y2 = _mm256_permute_pd(x2, 0x05); + __m256d y3 = _mm256_permute_pd(x3, 0x05); + _mm256_storeu_pd(&x[i + 0], _mm256_addsub_pd(da_r * x0, da_i * y0)); + _mm256_storeu_pd(&x[i + 4], _mm256_addsub_pd(da_r * x1, da_i * y1)); + _mm256_storeu_pd(&x[i + 8], _mm256_addsub_pd(da_r * x2, da_i * y2)); + _mm256_storeu_pd(&x[i + 12], _mm256_addsub_pd(da_r * x3, da_i * y3)); + } +#endif +} + + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m512d y0 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 0]), 0x55); + __m512d y1 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 8]), 0x55); + _mm512_storeu_pd(&x[i + 0], da_i * y0); + _mm512_storeu_pd(&x[i + 8], da_i * y1); + } +#else + __m256d da_i = _mm256_set1_pd(alpha[1]) * _mm256_set_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m256d y0 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 0]), 0x05); + __m256d y1 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 8]), 0x05); + __m256d y2 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 16]), 0x05); + __m256d y3 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 24]), 0x05); + _mm256_storeu_pd(&x[i + 0], da_i * y0); + _mm256_storeu_pd(&x[i + 4], da_i * y1); + _mm256_storeu_pd(&x[i + 8], da_i * y2); + _mm256_storeu_pd(&x[i + 12], da_i * y3); + } +#endif +} + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512d da_r = _mm512_set1_pd(alpha[0]); + for (; i < n2; i += 16) { + _mm512_storeu_pd(&x[i + 0], da_r * _mm512_loadu_pd(&x[i + 0])); + _mm512_storeu_pd(&x[i + 8], da_r * _mm512_loadu_pd(&x[i + 8])); + } +#else + __m256d da_r = _mm256_set1_pd(alpha[0]); + for (; i < n2; i += 16) { + _mm256_storeu_pd(&x[i + 0], da_r * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&x[i + 4], da_r * _mm256_loadu_pd(&x[i + 4])); + _mm256_storeu_pd(&x[i + 8], da_r * _mm256_loadu_pd(&x[i + 8])); + _mm256_storeu_pd(&x[i + 12], da_r * _mm256_loadu_pd(&x[i + 12])); + } +#endif +} + + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + 
BLASLONG i = 0; + BLASLONG n2 = n + n; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512d zero = _mm512_setzero_pd(); + for (; i < n2; i += 16) { + _mm512_storeu_pd(&x[i], zero); + _mm512_storeu_pd(&x[i + 8], zero); + } +#else + __m256d zero = _mm256_setzero_pd(); + for (; i < n2; i += 16) { + _mm256_storeu_pd(&x[i + 0], zero); + _mm256_storeu_pd(&x[i + 4], zero); + _mm256_storeu_pd(&x[i + 8], zero); + _mm256_storeu_pd(&x[i + 12], zero); + } +#endif + +} + +#else +#include "zscal_microk_haswell-2.c" +#endif diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c index 83ef3e0d8..b274af292 100644 --- a/lapack-netlib/INSTALL/ilaver.c +++ b/lapack-netlib/INSTALL/ilaver.c @@ -573,7 +573,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* ===================================================================== */ *vers_major__ = 3; - *vers_minor__ = 9; + *vers_minor__ = 11; *vers_patch__ = 0; /* ===================================================================== */ diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f index 79fe597ae..a246c37cb 100644 --- a/lapack-netlib/INSTALL/ilaver.f +++ b/lapack-netlib/INSTALL/ilaver.f @@ -60,7 +60,7 @@ INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH * ===================================================================== VERS_MAJOR = 3 - VERS_MINOR = 9 + VERS_MINOR = 11 VERS_PATCH = 0 * ===================================================================== * diff --git a/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c b/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c index 1c027f862..a174fcaf0 100644 --- a/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c @@ -49,11 +49,9 @@ LAPACKE_dgels (row-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - December 2016 - */ /* Calling DGELS using row-major layout */ @@ -66,8 +64,8 @@ int main (int argc, const char * argv[]) { /* Locals */ - double A[5][3] = {1,1,1,2,3,4,3,5,2,4,2,5,5,4,3}; - double b[5][2] = {-10,-3,12,14,14,12,16,16,18,16}; + double A[5][3] = {{1,1,1},{2,3,4},{3,5,2},{4,2,5},{5,4,3}}; + double b[5][2] = {{-10,-3},{12,14},{14,12},{16,16},{18,16}}; lapack_int info,m,n,lda,ldb,nrhs; /* Initialization */ diff --git a/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c b/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c index c8bdd6e4e..44a470d47 100644 --- a/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c @@ -25,11 +25,9 @@ LAPACKE_dgesv (col-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- - December 2016 - */ /* Includes */ #include @@ -94,7 +92,7 @@ int main(int argc, char **argv) { /* Check for the exact singularity */ if( info > 0 ) { printf( "The diagonal element of the triangular factor of A,\n" ); - printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); + printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); printf( "the solution could not be computed.\n" ); exit( 1 ); } diff --git a/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c b/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c index 35bdcbcae..5411ef049 100644 --- a/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c @@ -25,11 +25,9 @@ LAPACKE_dgesv (row-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - December 2016 - */ #include #include @@ -91,7 +89,7 @@ int main(int argc, char **argv) { /* Check for the exact singularity */ if( info > 0 ) { printf( "The diagonal element of the triangular factor of A,\n" ); - printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); + printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); printf( "the solution could not be computed.\n" ); exit( 1 ); } diff --git a/lapack-netlib/LAPACKE/example/lapacke_example_aux.c b/lapack-netlib/LAPACKE/example/lapacke_example_aux.c index 9b72eb620..19fff7905 100644 --- a/lapack-netlib/LAPACKE/example/lapacke_example_aux.c +++ b/lapack-netlib/LAPACKE/example/lapacke_example_aux.c @@ -28,6 +28,6 @@ void print_matrix_colmajor( char* desc, lapack_int m, lapack_int n, double* mat, void print_vector( char* desc, lapack_int n, lapack_int* vec ) { lapack_int j; printf( "\n %s\n", desc ); - for( j = 0; j < n; j++ ) printf( " %6i", vec[j] ); + for( j = 0; j < n; j++ ) printf( " %6" LAPACK_IFMT, vec[j] ); printf( "\n" ); } diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 14695fdc8..b5a276f5a 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -12,6 +12,7 @@ #include #include +#include /* It seems all current Fortran compilers put strlen at end. 
* Some historical compilers put strlen after the str argument @@ -80,11 +81,26 @@ extern "C" { /*----------------------------------------------------------------------------*/ #ifndef lapack_int -#define lapack_int int +#if defined(LAPACK_ILP64) +#define lapack_int int64_t +#else +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 +#endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif /* f2c, hence clapack and MacOS Accelerate, returns double instead of float @@ -115,7 +131,7 @@ typedef lapack_logical (*LAPACK_Z_SELECT2) ( const lapack_complex_double*, const lapack_complex_double* ); #define LAPACK_lsame_base LAPACK_GLOBAL(lsame,LSAME) -lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, +lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, lapack_int lca, lapack_int lcb #ifdef LAPACK_FORTRAN_STRLEN_END , size_t, size_t @@ -21986,6 +22002,84 @@ void LAPACK_ztrsyl_base( #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) #endif +#define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3) +void LAPACK_ctrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, float* scale, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) +void LAPACK_dtrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, double* scale, + lapack_int* iwork, lapack_int const* liwork, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3) +void LAPACK_strsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, float* scale, + lapack_int* iwork, lapack_int const* liwork, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_strsyl3(...) 
LAPACK_strsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3) +void LAPACK_ztrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, double* scale, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) void LAPACK_ctrtri_base( char const* uplo, char const* diag, diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index f6fbfcc33..9998b1504 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -2313,6 +2313,19 @@ lapack_int LAPACKE_zlagge( int matrix_layout, lapack_int m, lapack_int n, float LAPACKE_slamch( char cmach ); double LAPACKE_dlamch( char cmach ); +float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab ); +double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab ); +float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab ); +double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab ); + float LAPACKE_slange( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda ); double LAPACKE_dlange( int matrix_layout, char norm, lapack_int m, @@ -4477,6 +4490,23 @@ lapack_int LAPACKE_ztrsyl( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ); +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ); +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ); + lapack_int LAPACKE_strtri( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri( int matrix_layout, char uplo, char diag, lapack_int n, @@ -7576,6 +7606,21 @@ double LAPACKE_dlapy3_work( double x, double y, double z ); float LAPACKE_slamch_work( char cmach ); double LAPACKE_dlamch_work( char cmach ); +float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab, float* work ); +double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int 
ldab, double* work ); +float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab, + float* work ); +double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab, + double* work ); + float LAPACKE_slange_work( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* work ); @@ -10174,6 +10219,35 @@ lapack_int LAPACKE_ztrsyl_work( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, + float* c, lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ); +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, + double* c, lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ); +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ); +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ); + lapack_int LAPACKE_strtri_work( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri_work( int matrix_layout, char uplo, char diag, diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 4a7d15760..c64fc4416 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -42,17 +42,29 @@ extern "C" { #include #include +#include #ifndef lapack_int #if defined(LAPACK_ILP64) -#define lapack_int int64_t +#define lapack_int int64_t #else -#define lapack_int int32_t +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 #endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif #ifndef LAPACK_COMPLEX_CUSTOM diff --git a/lapack-netlib/LAPACKE/include/lapacke_utils.h b/lapack-netlib/LAPACKE/include/lapacke_utils.h index f84604e8a..332a5024f 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_utils.h +++ b/lapack-netlib/LAPACKE/include/lapacke_utils.h @@ -68,7 +68,7 @@ void LAPACKE_xerbla( const char *name, lapack_int info ); /* Compare two chars (case-insensitive) */ lapack_logical LAPACKE_lsame( char ca, char cb ) #if defined __GNUC__ - __attribute__((const)) + __attribute__((const)) #endif ; @@ -128,6 +128,10 @@ void LAPACKE_ctp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_ctr_trans( int matrix_layout, char uplo, char diag, 
lapack_int n, const lapack_complex_float *in, lapack_int ldin, lapack_complex_float *out, lapack_int ldout ); +void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *in, lapack_int ldin, + lapack_complex_float *out, lapack_int ldout ); void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -178,6 +182,10 @@ void LAPACKE_dtp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_dtr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const double *in, lapack_int ldin, double *out, lapack_int ldout ); +void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *in, lapack_int ldin, + double *out, lapack_int ldout ); void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -228,6 +236,10 @@ void LAPACKE_stp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_str_trans( int matrix_layout, char uplo, char diag, lapack_int n, const float *in, lapack_int ldin, float *out, lapack_int ldout ); +void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *in, lapack_int ldin, + float *out, lapack_int ldout ); void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -284,6 +296,10 @@ void LAPACKE_ztp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_ztr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_double *in, lapack_int ldin, lapack_complex_double *out, lapack_int ldout ); +void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *in, lapack_int ldin, + lapack_complex_double *out, lapack_int ldout ); /* NaN checkers */ #define LAPACK_SISNAN( x ) ( x != x ) @@ -376,6 +392,10 @@ lapack_logical LAPACKE_ctr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_float *a, lapack_int lda ); +lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *a, + lapack_int lda ); lapack_logical LAPACKE_dgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -440,6 +460,9 @@ lapack_logical LAPACKE_dtr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const double *a, lapack_int lda ); +lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *a, lapack_int lda ); lapack_logical LAPACKE_sgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -504,6 +527,9 @@ lapack_logical LAPACKE_str_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const float *a, lapack_int lda ); +lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *a, lapack_int lda ); lapack_logical LAPACKE_zgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -574,6 +600,10 @@ lapack_logical LAPACKE_ztr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_double *a, lapack_int lda ); +lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *a, + lapack_int 
lda ); #ifdef __cplusplus } diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 7f827e1c9..9c02c1445 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -358,6 +358,8 @@ lapacke_clacrm.o \ lapacke_clacrm_work.o \ lapacke_clag2z.o \ lapacke_clag2z_work.o \ +lapacke_clangb.o \ +lapacke_clangb_work.o \ lapacke_clange.o \ lapacke_clange_work.o \ lapacke_clanhe.o \ @@ -842,6 +844,8 @@ lapacke_dlag2s.o \ lapacke_dlag2s_work.o \ lapacke_dlamch.o \ lapacke_dlamch_work.o \ +lapacke_dlangb.o \ +lapacke_dlangb_work.o \ lapacke_dlange.o \ lapacke_dlange_work.o \ lapacke_dlansy.o \ @@ -1414,6 +1418,8 @@ lapacke_slacpy.o \ lapacke_slacpy_work.o \ lapacke_slamch.o \ lapacke_slamch_work.o \ +lapacke_slangb.o \ +lapacke_slangb_work.o \ lapacke_slange.o \ lapacke_slange_work.o \ lapacke_slansy.o \ @@ -2116,6 +2122,8 @@ lapacke_zlacrm.o \ lapacke_zlacrm_work.o \ lapacke_zlag2c.o \ lapacke_zlag2c_work.o \ +lapacke_zlangb.o \ +lapacke_zlangb_work.o \ lapacke_zlange.o \ lapacke_zlange_work.o \ lapacke_zlanhe.o \ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c index 081f5b129..af6a247ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c @@ -61,12 +61,12 @@ lapack_int LAPACKE_cgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -9; LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c index 2257c64df..632ddd661 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c @@ -65,12 +65,12 @@ lapack_int LAPACKE_cgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -13; LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 8406635e9..05ff8d57f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c index ff74939a3..be0b8347f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c @@ -72,12 +72,12 @@ lapack_int LAPACKE_cgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) 
) { info = -15; LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c index 7edb1fa9b..311fe6e0a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c @@ -76,12 +76,12 @@ lapack_int LAPACKE_cggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_clangb.c b/lapack-netlib/LAPACKE/src/lapacke_clangb.c new file mode 100644 index 000000000..0d61575aa --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_clangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function clangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab ) +{ + lapack_int info = 0; + float res = 0.; + float* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_clangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_clangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_clangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c new file mode 100644 index 000000000..b5b2cf816 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c @@ -0,0 +1,84 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
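The wrapper above guards its input scan with LAPACK_DISABLE_NAN_CHECK at build time and LAPACKE_get_nancheck() at run time, like the rest of LAPACKE. A short caller-side sketch, assuming the standard LAPACKE_set_nancheck / LAPACKE_get_nancheck interface declared in lapacke.h:

#include <lapacke.h>

/* Skip the NaN scan of input matrices around a group of LAPACKE calls,
 * then restore the previous setting (checking is enabled by default). */
void call_without_nan_checks(void)
{
    int previous = LAPACKE_get_nancheck();
    LAPACKE_set_nancheck(0);
    /* ... LAPACKE_clangb(...), LAPACKE_ctrsyl3(...), etc. ... */
    LAPACKE_set_nancheck(previous);
}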
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function clangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab, + float* work ) +{ + lapack_int info = 0; + float res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_clangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + float* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_clangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr.c b/lapack-netlib/LAPACKE/src/lapacke_clantr.c index 88e765f2b..e00b6c578 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr.c @@ -33,8 +33,8 @@ #include "lapacke_utils.h" float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, - lapack_int m, lapack_int n, const lapack_complex_float* a, - lapack_int lda ) + lapack_int m, lapack_int n, const lapack_complex_float* a, + lapack_int lda ) { lapack_int info = 0; float res = 0.; @@ -46,7 +46,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_ctr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_ctz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index ccd34cecd..8b1492bec 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -42,7 +42,9 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; lapack_complex_float* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_clarfb", -1 ); return -1; @@ -50,59 +52,27 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column 
stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); + return -8; + } + if( LAPACKE_ctz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_cge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); - return -8; - } - if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); - return -8; - } - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c index 3ad97c22d..90ff0851f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c @@ -42,6 +42,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; lapack_complex_float *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -52,16 +54,14 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? 
k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -81,6 +81,11 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_clarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_clarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (lapack_complex_float*) LAPACKE_malloc( sizeof(lapack_complex_float) * @@ -102,36 +107,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 ); - return -8; - } - LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 ); - return -8; - } - LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_ctz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_cge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c index 5ec948e7b..e01664bdf 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c @@ -50,16 +50,24 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); lapack_complex_float* 
v_t = NULL; lapack_complex_float* t_t = NULL; lapack_complex_float* a_t = NULL; lapack_complex_float* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; @@ -69,7 +77,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; @@ -87,13 +95,13 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_0; } t_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,nb) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -105,10 +113,10 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_cge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_cge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_ctpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -116,7 +124,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_cge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c new file mode 100644 index 000000000..c931aac48 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + 
return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c new file mode 100644 index 000000000..09c08d92a --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( 
info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c index c4de72394..424f5d176 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c @@ -59,12 +59,12 @@ lapack_int LAPACKE_dgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -10; LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c index 9efb49ed3..7f4c6881d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c @@ -63,12 +63,12 @@ lapack_int LAPACKE_dgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -14; LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c index 4e1b87681..4a0d427b3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c index effa1b3f5..bc6bf47d9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c @@ -70,12 +70,12 @@ lapack_int LAPACKE_dgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c index ace40a32a..bde1321d7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c @@ -73,12 +73,12 @@ lapack_int LAPACKE_dggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && 
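The relaxed checks in these *geev/*gges wrappers accept ldvl = 1 (and ldvr = 1) whenever the corresponding vectors are not requested, instead of rejecting any value below n. A minimal column-major usage sketch, assuming a NULL vl is acceptable when jobvl is 'N' (DGEEV does not reference VL in that case); the 2x2 matrix is illustrative data only:

#include <stdio.h>
#include <lapacke.h>

int main(void)
{
    double a[4] = { 0.0, -1.0,   /* column-major 2x2: A = [0 1; -1 0] */
                    1.0,  0.0 };
    double wr[2], wi[2];
    double vr[4];
    /* jobvl = 'N': no left eigenvectors, so ldvl = 1 and vl = NULL pass the
     * argument checks after this change (previously ldvl < n was rejected). */
    lapack_int info = LAPACKE_dgeev(LAPACK_COL_MAJOR, 'N', 'V', 2,
                                    a, 2, wr, wi,
                                    NULL, 1, vr, 2);
    if (info == 0)
        printf("lambda = %f %+fi\n", wr[0], wi[0]);
    return (int)info;
}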
ldvsr < n ) ) { info = -19; LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlangb.c b/lapack-netlib/LAPACKE/src/lapacke_dlangb.c new file mode 100644 index 000000000..ca16ea7f4 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dlangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab ) +{ + lapack_int info = 0; + double res = 0.; + double* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dlangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_dlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dlangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c new file mode 100644 index 000000000..ba04c2b62 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab, double* work ) +{ + lapack_int info = 0; + double res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_dlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + double* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_dlangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c index 4d1be93d7..b20af0eb4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c @@ -46,7 +46,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dtr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_dtz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index 3c3c24c54..82e8fae52 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -41,7 +41,9 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; double* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -1 ); return -1; @@ -49,59 +51,27 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? 
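The new LAPACKE_dlangb above (and its s/c/z counterparts earlier in this change) computes a norm of a general band matrix held in LAPACK band storage, where column j of ab stores A(i,j) in row ku+i-j and ldab must be at least kl+ku+1. A minimal column-major sketch with an arbitrary tridiagonal matrix as illustration data:

#include <stdio.h>
#include <lapacke.h>

int main(void)
{
    /* 4x4 tridiagonal matrix, kl = ku = 1, column-major band storage:
     * row 0 holds the superdiagonal, row 1 the diagonal, row 2 the subdiagonal. */
    const lapack_int n = 4, kl = 1, ku = 1, ldab = 3;
    double ab[3 * 4] = {
        /* col 0 */ 0.0, 4.0, 1.0,
        /* col 1 */ 1.0, 4.0, 1.0,
        /* col 2 */ 1.0, 4.0, 1.0,
        /* col 3 */ 1.0, 4.0, 0.0
    };
    double one_norm = LAPACKE_dlangb(LAPACK_COL_MAJOR, '1', n, kl, ku, ab, ldab);
    double inf_norm = LAPACKE_dlangb(LAPACK_COL_MAJOR, 'I', n, kl, ku, ab, ldab);
    printf("||A||_1 = %f, ||A||_inf = %f\n", one_norm, inf_norm);
    return 0;
}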
n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); + return -8; + } + if( LAPACKE_dtz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_dge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); - return -8; - } - if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); - return -8; - } - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c index 57c53bae3..1a68bf762 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c @@ -41,6 +41,8 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; double *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -51,16 +53,14 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? 
k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -80,6 +80,11 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_dlarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dlarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (double*) LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,ncols_v) ); @@ -98,36 +103,8 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_dtr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb_work", -8 ); - return -8; - } - LAPACKE_dtr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_dtr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb_work", -8 ); - return -8; - } - LAPACKE_dtr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_dtz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_dge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c index d9ee6226b..366acd369 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c @@ -48,16 +48,24 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); double* v_t = NULL; double* t_t = NULL; double* a_t = NULL; double* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; @@ -67,7 +75,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); 
return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; @@ -83,12 +91,12 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } - t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,nb) ); + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -99,10 +107,10 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_dge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_dtpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -110,7 +118,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_dge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c new file mode 100644 index 000000000..c95a772de --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( 
sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c new file mode 100644 index 000000000..272c35b38 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, double* c, + lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + double* a_t = NULL; + double* b_t = NULL; + double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (double*)LAPACKE_malloc( sizeof(double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + return info; +} diff --git 
a/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c index 0f5a8e004..af6dbedf0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c @@ -59,12 +59,12 @@ lapack_int LAPACKE_sgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -10; LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c index d05ea16e9..67f4982bf 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c @@ -63,12 +63,12 @@ lapack_int LAPACKE_sgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -14; LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c index 0b6406dec..627d2406c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c index a3b09de30..1bd3eacf4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c @@ -70,12 +70,12 @@ lapack_int LAPACKE_sgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c index d3927e525..b1fbe1902 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c @@ -73,12 +73,12 @@ lapack_int LAPACKE_sggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -19; LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_slangb.c b/lapack-netlib/LAPACKE/src/lapacke_slangb.c new file mode 100644 
index 000000000..9ba3f30d8 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_slangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function slangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab ) +{ + lapack_int info = 0; + float res = 0.; + float* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_slangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_slangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_slangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c new file mode 100644 index 000000000..7ef86e9d9 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function slangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab, float* work ) +{ + lapack_int info = 0; + float res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_slangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + float* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_slangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr.c b/lapack-netlib/LAPACKE/src/lapacke_slantr.c index 2f4c65889..e2f67cfd6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr.c @@ -46,7 +46,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( 
LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_str_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_stz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index 37d51dee5..892648f4b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -41,7 +41,9 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; float* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_slarfb", -1 ); return -1; @@ -49,59 +51,27 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); + return -8; + } + if( LAPACKE_stz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_sge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); - return -8; - } - if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); - return -8; - } - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c index 2f5d61676..d805a947a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c @@ -41,6 +41,8 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; float *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -51,16 +53,14 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -80,6 +80,11 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_slarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_slarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (float*)LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,ncols_v) ); if( v_t == NULL ) { @@ -97,36 +102,8 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_str_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb_work", -8 ); - return -8; - } - LAPACKE_str_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_str_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb_work", -8 ); - return -8; - } - LAPACKE_str_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_stz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_sge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c index 095fbdcd9..c5a3a1496 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c @@ -48,16 +48,24 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); float* v_t = NULL; float* t_t = NULL; float* a_t = NULL; float* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; @@ -67,7 +75,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; @@ 
-83,12 +91,12 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } - t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,nb) ); + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -99,10 +107,10 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_sge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_sge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_stpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -110,7 +118,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_sge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c new file mode 100644 index 000000000..1cfc626c2 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = 
(lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c new file mode 100644 index 000000000..3c50e4a45 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, float* c, + lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + float* a_t = NULL; + float* b_t = NULL; + float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (float*)LAPACKE_malloc( sizeof(float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c index 9393f825a..445b9dc1c 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c @@ -61,12 +61,12 @@ lapack_int LAPACKE_zgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -9; LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c index e34112c09..29dbf06f0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c @@ -65,12 +65,12 @@ lapack_int LAPACKE_zgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -13; LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c index 528b94a47..1d318e571 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c index 2694c6530..13e2455c6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c @@ -72,12 +72,12 @@ lapack_int LAPACKE_zgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_zgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -15; LAPACKE_xerbla( "LAPACKE_zgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_zgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c index f9f1ccee1..fe99949b7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c @@ -76,12 +76,12 @@ lapack_int LAPACKE_zggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlangb.c b/lapack-netlib/LAPACKE/src/lapacke_zlangb.c new file mode 100644 index 000000000..3a22ad982 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zlangb.c @@ -0,0 +1,73 @@ 
+/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab ) +{ + lapack_int info = 0; + double res = 0.; + double* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zlangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_zlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zlangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c new file mode 100644 index 000000000..d64fb482d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c @@ -0,0 +1,84 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab, + double* work ) +{ + lapack_int info = 0; + double res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_zlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + double* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_zlangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c index f6656d84d..4c078b9b0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c @@ -46,7 +46,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK 
if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_ztr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_ztz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 7cd23dde8..25cedb506 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -42,7 +42,9 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; lapack_complex_double* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -1 ); return -1; @@ -50,59 +52,27 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); + return -8; + } + if( LAPACKE_ztz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_zge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); - return -8; - } - if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); - return -8; - } - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c index 1b4f892a1..64eb05263 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c @@ -42,6 +42,8 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; lapack_complex_double *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -52,16 +54,14 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -81,6 +81,11 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_zlarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zlarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (lapack_complex_double*) LAPACKE_malloc( sizeof(lapack_complex_double) * @@ -102,36 +107,8 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ztr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb_work", -8 ); - return -8; - } - LAPACKE_ztr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ztr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb_work", -8 ); - return -8; - } - LAPACKE_ztr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_ztz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_zge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c index 643ae1d9d..104efa8f3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c @@ -50,16 +50,24 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); lapack_complex_double* v_t = NULL; lapack_complex_double* t_t = NULL; lapack_complex_double* a_t = NULL; lapack_complex_double* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; @@ -69,7 +77,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( 
"LAPACKE_ztpmqrt_work", info ); return info; @@ -87,13 +95,13 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_0; } t_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,nb) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -105,10 +113,10 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_zge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_ztpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -116,7 +124,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c new file mode 100644 index 000000000..dbc9bcf9f --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call 
middle-level interface */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c new file mode 100644 index 000000000..a7ebd5da6 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/utils/CMakeLists.txt b/lapack-netlib/LAPACKE/utils/CMakeLists.txt index dd36ee33e..dfb9aa370 100644 --- a/lapack-netlib/LAPACKE/utils/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/utils/CMakeLists.txt @@ -1,39 +1,46 @@ set(UTILS 
-lapacke_c_nancheck.c lapacke_ctr_trans.c lapacke_make_complex_float.c lapacke_zgb_nancheck.c -lapacke_cgb_nancheck.c lapacke_d_nancheck.c lapacke_s_nancheck.c lapacke_zgb_trans.c -lapacke_cgb_trans.c lapacke_dgb_nancheck.c lapacke_sgb_nancheck.c lapacke_zge_nancheck.c -lapacke_cge_nancheck.c lapacke_dgb_trans.c lapacke_sgb_trans.c lapacke_zge_trans.c -lapacke_cge_trans.c lapacke_dge_nancheck.c lapacke_sge_nancheck.c lapacke_zgg_nancheck.c -lapacke_cgg_nancheck.c lapacke_dge_trans.c lapacke_sge_trans.c lapacke_zgg_trans.c -lapacke_cgg_trans.c lapacke_dgg_nancheck.c lapacke_sgg_nancheck.c lapacke_zgt_nancheck.c -lapacke_cgt_nancheck.c lapacke_dgg_trans.c lapacke_sgg_trans.c lapacke_zhb_nancheck.c -lapacke_chb_nancheck.c lapacke_dgt_nancheck.c lapacke_sgt_nancheck.c lapacke_zhb_trans.c -lapacke_chb_trans.c lapacke_dhs_nancheck.c lapacke_shs_nancheck.c lapacke_zhe_nancheck.c -lapacke_che_nancheck.c lapacke_dhs_trans.c lapacke_shs_trans.c lapacke_zhe_trans.c -lapacke_che_trans.c lapacke_dpb_nancheck.c lapacke_spb_nancheck.c lapacke_zhp_nancheck.c -lapacke_chp_nancheck.c lapacke_dpb_trans.c lapacke_spb_trans.c lapacke_zhp_trans.c -lapacke_chp_trans.c lapacke_dpf_nancheck.c lapacke_spf_nancheck.c lapacke_zhs_nancheck.c -lapacke_chs_nancheck.c lapacke_dpf_trans.c lapacke_spf_trans.c lapacke_zhs_trans.c -lapacke_chs_trans.c lapacke_dpo_nancheck.c lapacke_spo_nancheck.c lapacke_zpb_nancheck.c -lapacke_cpb_nancheck.c lapacke_dpo_trans.c lapacke_spo_trans.c lapacke_zpb_trans.c -lapacke_cpb_trans.c lapacke_dpp_nancheck.c lapacke_spp_nancheck.c lapacke_zpf_nancheck.c -lapacke_cpf_nancheck.c lapacke_dpp_trans.c lapacke_spp_trans.c lapacke_zpf_trans.c -lapacke_cpf_trans.c lapacke_dpt_nancheck.c lapacke_spt_nancheck.c lapacke_zpo_nancheck.c -lapacke_cpo_nancheck.c lapacke_dsb_nancheck.c lapacke_ssb_nancheck.c lapacke_zpo_trans.c -lapacke_cpo_trans.c lapacke_dsb_trans.c lapacke_ssb_trans.c lapacke_zpp_nancheck.c -lapacke_cpp_nancheck.c lapacke_dsp_nancheck.c lapacke_ssp_nancheck.c lapacke_zpp_trans.c -lapacke_cpp_trans.c lapacke_dsp_trans.c lapacke_ssp_trans.c lapacke_zpt_nancheck.c -lapacke_cpt_nancheck.c lapacke_dst_nancheck.c lapacke_sst_nancheck.c lapacke_zsp_nancheck.c -lapacke_csp_nancheck.c lapacke_dsy_nancheck.c lapacke_ssy_nancheck.c lapacke_zsp_trans.c -lapacke_csp_trans.c lapacke_dsy_trans.c lapacke_ssy_trans.c lapacke_zst_nancheck.c -lapacke_cst_nancheck.c lapacke_dtb_nancheck.c lapacke_stb_nancheck.c lapacke_zsy_nancheck.c -lapacke_csy_nancheck.c lapacke_dtb_trans.c lapacke_stb_trans.c lapacke_zsy_trans.c -lapacke_csy_trans.c lapacke_dtf_nancheck.c lapacke_stf_nancheck.c lapacke_ztb_nancheck.c -lapacke_ctb_nancheck.c lapacke_dtf_trans.c lapacke_stf_trans.c lapacke_ztb_trans.c -lapacke_ctb_trans.c lapacke_dtp_nancheck.c lapacke_stp_nancheck.c lapacke_ztf_nancheck.c -lapacke_ctf_nancheck.c lapacke_dtp_trans.c lapacke_stp_trans.c lapacke_ztf_trans.c -lapacke_ctf_trans.c lapacke_dtr_nancheck.c lapacke_str_nancheck.c lapacke_ztp_nancheck.c -lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c -lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c -lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c +lapacke_c_nancheck.c lapacke_d_nancheck.c lapacke_s_nancheck.c lapacke_z_nancheck.c +lapacke_cgb_nancheck.c lapacke_dgb_nancheck.c lapacke_sgb_nancheck.c lapacke_zgb_trans.c +lapacke_cgb_trans.c lapacke_dgb_trans.c lapacke_sgb_trans.c lapacke_zgb_nancheck.c +lapacke_cge_nancheck.c lapacke_dge_nancheck.c 
lapacke_sge_nancheck.c lapacke_zge_nancheck.c +lapacke_cge_trans.c lapacke_dge_trans.c lapacke_sge_trans.c lapacke_zge_trans.c +lapacke_cgg_nancheck.c lapacke_dgg_nancheck.c lapacke_sgg_nancheck.c lapacke_zgg_nancheck.c +lapacke_cgg_trans.c lapacke_dgg_trans.c lapacke_sgg_trans.c lapacke_zgg_trans.c +lapacke_cgt_nancheck.c lapacke_dgt_nancheck.c lapacke_sgt_nancheck.c lapacke_zgt_nancheck.c +lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_ssb_nancheck.c lapacke_zhb_nancheck.c +lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_ssb_trans.c lapacke_zhb_trans.c +lapacke_che_nancheck.c lapacke_zhe_nancheck.c +lapacke_che_trans.c lapacke_zhe_trans.c +lapacke_chp_nancheck.c lapacke_zhp_nancheck.c +lapacke_chp_trans.c lapacke_zhp_trans.c +lapacke_chs_nancheck.c lapacke_dhs_nancheck.c lapacke_shs_nancheck.c lapacke_zhs_nancheck.c +lapacke_chs_trans.c lapacke_dhs_trans.c lapacke_shs_trans.c lapacke_zhs_trans.c +lapacke_cpb_nancheck.c lapacke_dpb_nancheck.c lapacke_spb_nancheck.c lapacke_zpb_nancheck.c +lapacke_cpb_trans.c lapacke_dpb_trans.c lapacke_spb_trans.c lapacke_zpb_trans.c +lapacke_cpf_nancheck.c lapacke_dpf_nancheck.c lapacke_spf_nancheck.c lapacke_zpf_nancheck.c +lapacke_cpf_trans.c lapacke_dpf_trans.c lapacke_spf_trans.c lapacke_zpf_trans.c +lapacke_cpo_nancheck.c lapacke_dpo_nancheck.c lapacke_spo_nancheck.c lapacke_zpo_nancheck.c +lapacke_cpo_trans.c lapacke_dpo_trans.c lapacke_spo_trans.c lapacke_zpo_trans.c +lapacke_cpp_nancheck.c lapacke_dpp_nancheck.c lapacke_spp_nancheck.c lapacke_zpp_nancheck.c +lapacke_cpp_trans.c lapacke_dpp_trans.c lapacke_spp_trans.c lapacke_zpp_trans.c +lapacke_cpt_nancheck.c lapacke_dpt_nancheck.c lapacke_spt_nancheck.c lapacke_zpt_nancheck.c +lapacke_csp_nancheck.c lapacke_dsp_nancheck.c lapacke_ssp_nancheck.c lapacke_zsp_nancheck.c +lapacke_csp_trans.c lapacke_dsp_trans.c lapacke_ssp_trans.c lapacke_zsp_trans.c +lapacke_cst_nancheck.c lapacke_dst_nancheck.c lapacke_sst_nancheck.c lapacke_zst_nancheck.c +lapacke_csy_nancheck.c lapacke_dsy_nancheck.c lapacke_ssy_nancheck.c lapacke_zsy_nancheck.c +lapacke_csy_trans.c lapacke_dsy_trans.c lapacke_ssy_trans.c lapacke_zsy_trans.c +lapacke_ctb_nancheck.c lapacke_dtb_nancheck.c lapacke_stb_nancheck.c lapacke_ztb_nancheck.c +lapacke_ctb_trans.c lapacke_dtb_trans.c lapacke_stb_trans.c lapacke_ztb_trans.c +lapacke_ctf_nancheck.c lapacke_dtf_nancheck.c lapacke_stf_nancheck.c lapacke_ztf_nancheck.c +lapacke_ctf_trans.c lapacke_dtf_trans.c lapacke_stf_trans.c lapacke_ztf_trans.c +lapacke_ctp_nancheck.c lapacke_dtp_nancheck.c lapacke_stp_nancheck.c lapacke_ztp_nancheck.c +lapacke_ctp_trans.c lapacke_dtp_trans.c lapacke_stp_trans.c lapacke_ztp_trans.c +lapacke_ctr_nancheck.c lapacke_dtr_nancheck.c lapacke_str_nancheck.c lapacke_ztr_nancheck.c +lapacke_ctr_trans.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztr_trans.c +lapacke_ctz_nancheck.c lapacke_dtz_nancheck.c lapacke_stz_nancheck.c lapacke_ztz_nancheck.c +lapacke_ctz_trans.c lapacke_dtz_trans.c lapacke_stz_trans.c lapacke_ztz_trans.c + +lapacke_make_complex_float.c lapacke_make_complex_double.c +lapacke_lsame.c +lapacke_xerbla.c ) diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index adc573650..a1f863107 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ b/lapack-netlib/LAPACKE/utils/Makefile @@ -76,6 +76,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_ctp_trans.o \ lapacke_ctr_nancheck.o \ lapacke_ctr_trans.o \ + lapacke_ctz_nancheck.o \ + lapacke_ctz_trans.o \ lapacke_dgb_nancheck.o \ lapacke_dgb_trans.o \ 
lapacke_dge_nancheck.o \ @@ -110,6 +112,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_dtp_trans.o \ lapacke_dtr_nancheck.o \ lapacke_dtr_trans.o \ + lapacke_dtz_nancheck.o \ + lapacke_dtz_trans.o \ lapacke_lsame.o \ lapacke_sgb_nancheck.o \ lapacke_sgb_trans.o \ @@ -145,6 +149,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_stp_trans.o \ lapacke_str_nancheck.o \ lapacke_str_trans.o \ + lapacke_stz_nancheck.o \ + lapacke_stz_trans.o \ lapacke_xerbla.o \ lapacke_zgb_nancheck.o \ lapacke_zgb_trans.o \ @@ -184,6 +190,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_ztp_trans.o \ lapacke_ztr_nancheck.o \ lapacke_ztr_trans.o \ + lapacke_ztz_nancheck.o \ + lapacke_ztz_trans.o \ lapacke_make_complex_float.o \ lapacke_make_complex_double.o diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c new file mode 100644 index 000000000..bea956781 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c @@ -0,0 +1,144 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *a, + lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_cge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_ctr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c new file mode 100644 index 000000000..48d346611 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *in, lapack_int ldin, + lapack_complex_float *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_cge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + LAPACKE_ctr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c new file mode 100644 index 000000000..cd2ae6731 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c @@ -0,0 +1,143 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *a, lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_dge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda ) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_dtr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c new file mode 100644 index 000000000..b39000d42 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *in, lapack_int ldin, + double *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_dge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + LAPACKE_dtr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c new file mode 100644 index 000000000..7d7c30f96 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c @@ -0,0 +1,143 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *a, lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_sge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_str_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c new file mode 100644 index 000000000..cffee6c98 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *in, lapack_int ldin, + float *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_sge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + LAPACKE_str_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c new file mode 100644 index 000000000..481fa4c03 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c @@ -0,0 +1,144 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *a, + lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_zge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_ztr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c new file mode 100644 index 000000000..faef6da50 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *in, lapack_int ldin, + lapack_complex_double *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_zge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + LAPACKE_ztr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 03d15c23c..49798b0c5 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o + sgesvdq.o slarmm.o slatrs3.o strsyl3.o sgelst.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o + cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o endif ifdef USEXBLAS @@ -417,7 +417,7 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o + dgesvdq.o dlarmm.o dlatrs3.o dtrsyl3.o dgelst.o endif ifdef USEXBLAS @@ -526,7 +526,7 @@ ZLASRC_O = \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ - zgesvdq.o + zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o endif ifdef USEXBLAS diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f index 369ed1983..46eaf33b9 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. 
LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f index be5720f4f..55cab8b23 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f index bff973214..d2ad13ced 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f index 79e86b41b..623b88a8a 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. 
If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/cgebak.f b/lapack-netlib/SRC/cgebak.f index 201dbfcec..4348d5ea4 100644 --- a/lapack-netlib/SRC/cgebak.f +++ b/lapack-netlib/SRC/cgebak.f @@ -238,7 +238,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -252,7 +252,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/cgees.f b/lapack-netlib/SRC/cgees.f index 359fa2afe..71acfdba3 100644 --- a/lapack-netlib/SRC/cgees.f +++ b/lapack-netlib/SRC/cgees.f @@ -282,7 +282,7 @@ * CALL CHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = REAL( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/cgeesx.f b/lapack-netlib/SRC/cgeesx.f index 1113563ba..782e36747 100644 --- a/lapack-netlib/SRC/cgeesx.f +++ b/lapack-netlib/SRC/cgeesx.f @@ -337,7 +337,7 @@ * CALL CHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = REAL( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 25ab81302..e37b25b6b 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -704,11 +704,11 @@ IF ( LQUERY ) THEN CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_CGEQP3 = REAL( CDUMMY(1) ) + LWRK_CGEQP3 = INT( CDUMMY(1) ) CALL CGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_CGEQRF = REAL( CDUMMY(1) ) + LWRK_CGEQRF = INT( CDUMMY(1) ) CALL CGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_CGELQF = REAL( CDUMMY(1) ) + LWRK_CGELQF = INT( CDUMMY(1) ) END IF MINWRK = 2 OPTWRK = 2 @@ -724,7 +724,7 @@ IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N**2+LWCON, $ N+LWRK_CGEQRF, LWRK_CGESVJ ) @@ -760,10 +760,10 @@ IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMLQ = REAL( CDUMMY(1) ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, LWCON, LWRK_CGESVJ, $ N+LWRK_CGELQF, 2*N+LWRK_CGEQRF, @@ -799,10 +799,10 @@ IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', M, N, N, 
A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, N+LWRK_CGEQRF, $ LWRK_CGESVJ, LWRK_CUNMQRM ) @@ -861,26 +861,26 @@ IF ( LQUERY ) THEN CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQR = REAL( CDUMMY(1) ) + LWRK_CUNMQR = INT( CDUMMY(1) ) IF ( .NOT. JRACC ) THEN CALL CGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_CGEQP3N = REAL( CDUMMY(1) ) + LWRK_CGEQP3N = INT( CDUMMY(1) ) CALL CGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJU = REAL( CDUMMY(1) ) + LWRK_CGESVJU = INT( CDUMMY(1) ) CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJV = REAL( CDUMMY(1) ) + LWRK_CGESVJV = INT( CDUMMY(1) ) CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMLQ = REAL( CDUMMY(1) ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, @@ -909,13 +909,13 @@ ELSE CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJV = REAL( CDUMMY(1) ) + LWRK_CGESVJV = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMQR = REAL( CDUMMY(1) ) + LWRK_CUNMQR = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, $ 2*N+LWRK_CGEQRF, 2*N+N**2, diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f index 04defbb2e..da6b9092f 100644 --- a/lapack-netlib/SRC/cgelss.f +++ b/lapack-netlib/SRC/cgelss.f @@ -266,11 +266,11 @@ * * Compute space needed for CGEQRF CALL CGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_CGEQRF = REAL( DUM(1) ) + LWORK_CGEQRF = INT( DUM(1) ) * Compute space needed for CUNMQR CALL CUNMQR( 'L', 'C', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_CUNMQR = REAL( DUM(1) ) + LWORK_CUNMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + N*ILAENV( 1, 'CGEQRF', ' ', M, $ N, -1, -1 ) ) @@ -284,15 +284,15 @@ * Compute space needed for CGEBRD CALL CGEBRD( MM, N, A, LDA, S, S, DUM(1), DUM(1), DUM(1), $ -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) + LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 2*N + LWORK_CGEBRD ) MAXWRK = MAX( MAXWRK, 2*N + LWORK_CUNMBR ) @@ -310,23 +310,23 @@ * Compute space needed for CGELQF CALL CGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_CGELQF = REAL( DUM(1) ) + LWORK_CGELQF = INT( DUM(1) ) * Compute space needed for CGEBRD CALL CGEBRD( M, M, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) 
+ LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) * Compute space needed for CUNMLQ CALL CUNMLQ( 'L', 'C', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMLQ = REAL( DUM(1) ) + LWORK_CUNMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_CGELQF MAXWRK = MAX( MAXWRK, 3*M + M*M + LWORK_CGEBRD ) @@ -345,15 +345,15 @@ * Compute space needed for CGEBRD CALL CGEBRD( M, N, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) + LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) MAXWRK = 2*M + LWORK_CGEBRD MAXWRK = MAX( MAXWRK, 2*M + LWORK_CUNMBR ) MAXWRK = MAX( MAXWRK, 2*M + LWORK_CUNGBR ) diff --git a/lapack-netlib/SRC/cgelst.c b/lapack-netlib/SRC/cgelst.c new file mode 100644 index 000000000..48ded643d --- /dev/null +++ b/lapack-netlib/SRC/cgelst.c @@ -0,0 +1,1108 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + 
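
Note: the REAL( DUM(1) ) -> INT( DUM(1) ) edits above (and the similar ones in cgejsv.f, cgees.f and further below) all touch the same idiom: a workspace query with LWORK = -1 returns the optimal LWORK in WORK(1) as a floating-point value, which is then truncated to an INTEGER explicitly instead of relying on an implicit REAL-to-INTEGER assignment. A minimal sketch of that idiom from the C side, not part of the patch; the cgeqrf_ prototype is hand-written to match the Fortran routine and assumes 32-bit (LP64) LAPACK integers:

    /* Workspace query, then truncate the optimal size explicitly --
       the C analogue of LWORK_CGEQRF = INT( DUM(1) ). */
    #include <complex.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Hand-written prototype; adjust to your local headers / ILP64 settings. */
    extern void cgeqrf_(const int *m, const int *n, float complex *a,
                        const int *lda, float complex *tau,
                        float complex *work, const int *lwork, int *info);

    int main(void)
    {
        int m = 100, n = 50, lda = m, info = 0, lwork = -1;
        float complex *a   = calloc((size_t)lda * n, sizeof *a);
        float complex *tau = calloc((size_t)n, sizeof *tau);
        float complex wkopt;

        cgeqrf_(&m, &n, a, &lda, tau, &wkopt, &lwork, &info);   /* query only */
        lwork = (int) crealf(wkopt);                 /* explicit truncation   */

        float complex *work = malloc((size_t)lwork * sizeof *work);
        cgeqrf_(&m, &n, a, &lda, tau, work, &lwork, &info);     /* real call  */
        printf("optimal LWORK = %d, info = %d\n", lwork, info);

        free(work); free(tau); free(a);
        return info;
    }

Assigning a REAL to an INTEGER already truncates in Fortran, but the explicit INT() makes the intent clear and avoids type-conversion warnings.
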
+/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief CGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download CGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its conjugate-transpose, using a QR */ +/* > or LQ factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'C' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'C' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'C': the linear system involves A**H. 
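
Note: the spow_ui / dpow_ui / cpow_ui / zpow_ui / pow_ii helpers in the translated preamble above all use the same exponentiation-by-squaring loop, which needs only O(log n) multiplications instead of n - 1. A standalone sketch of the scheme (not part of the patch; unlike the helpers above it does not handle negative exponents):

    #include <assert.h>
    #include <stdio.h>

    /* x**n by repeated squaring: walk the bits of n from least to most
       significant, multiplying the current square of x into the result
       whenever the bit is set. */
    static long long ipow(long long x, unsigned n)
    {
        long long result = 1;
        while (n != 0) {
            if (n & 1u)
                result *= x;   /* this bit contributes x**(2**k) */
            n >>= 1;
            if (n != 0)
                x *= x;        /* square for the next bit (skip the final, unused one) */
        }
        return result;
    }

    int main(void)
    {
        assert(ipow(2, 10) == 1024);
        assert(ipow(3, 0)  == 1);
        assert(ipow(-2, 3) == -8);
        printf("2**40 = %lld\n", ipow(2, 40));
        return 0;
    }
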
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by CGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by CGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'C'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > modulus of elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of the modulus of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
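
A usage sketch for the new driver (not part of the patch): query the optimal workspace, then solve an overdetermined least-squares problem with TRANS = 'N'. The prototype is hand-written to match the translation in this file and assumes 32-bit LAPACK integers; on exit rows 1..N of B hold the solution and rows N+1..M the residual components.

    #include <complex.h>
    #include <stdio.h>
    #include <stdlib.h>

    extern void cgelst_(const char *trans, const int *m, const int *n,
                        const int *nrhs, float complex *a, const int *lda,
                        float complex *b, const int *ldb,
                        float complex *work, const int *lwork, int *info);

    int main(void)
    {
        int m = 4, n = 2, nrhs = 1, lda = 4, ldb = 4, info = 0, lwork = -1;
        /* column-major: fit y = x1 + x2*t to the points (1,6), (2,5), (3,7), (4,10) */
        float complex a[8] = { 1, 1, 1, 1,     /* column 1: ones      */
                               1, 2, 3, 4 };   /* column 2: t values  */
        float complex b[4] = { 6, 5, 7, 10 };
        float complex wkopt;

        cgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, &wkopt, &lwork, &info);
        lwork = (int) crealf(wkopt);                     /* optimal workspace */

        float complex *work = malloc((size_t)lwork * sizeof *work);
        cgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, &info);

        /* expected least-squares solution: x1 = 3.5, x2 = 1.4 */
        printf("info = %d, x = (%g, %g)\n", info, crealf(b[0]), crealf(b[1]));
        free(work);
        return info;
    }

Internally the M >= N, TRANS = 'N' path factors A = Q*R with CGEQRT, applies Q**H to B with CGEMQRT and finishes with a triangular solve (CTRTRS), which is exactly the sequence visible in the code further below.
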
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complexGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int cgelst_(char *trans, integer *m, integer *n, integer * + nrhs, complex *a, integer *lda, complex *b, integer *ldb, complex * + work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; + real r__1; + + /* Local variables */ + real anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + real rwork[1]; + integer lwopt, nb; + extern /* Subroutine */ int slabad_(real *, real *); + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + integer mn; + extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + extern real slamch_(char *); + extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *), xerbla_(char *, + integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern /* Subroutine */ int cgelqt_(integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, integer *); + integer scllen; + real bignum; + extern /* Subroutine */ int cgeqrt_(integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, integer *); + integer mnnrhs; + real smlnum; + logical lquery; + extern /* Subroutine */ int ctrtrs_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *), cgemlqt_(char *, char *, integer *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *, complex *, integer *), cgemqrt_(char *, char *, integer *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! 
(lsame_(trans, "N") || lsame_(trans, "C"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "CGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "CGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = slamch_("S") / slamch_("P"); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = clange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + clascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + clascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = clange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + clascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + clascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. 
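
Note: the block-size logic just above caps NB at MIN(M,N) (a *GEQRT/*GELQT requirement), shrinks it to what the supplied LWORK can actually hold, and falls back to unblocked code when the result drops below NBMIN. A small sketch of that decision (not from the patch; nb0 and nbmin stand in for the two ILAENV values):

    #include <stdio.h>

    /* Pick the block size the driver can actually use for a given LWORK.
       nb0 and nbmin play the role of ILAENV(1,...) and ILAENV(2,...). */
    static int pick_nb(int m, int n, int nrhs, int lwork, int nb0, int nbmin)
    {
        int mn = m < n ? m : n;
        int mnnrhs = mn > nrhs ? mn : nrhs;
        int nb = nb0;

        if (nb > mn)                         /* *GEQRT/*GELQT need NB <= MIN(M,N) */
            nb = mn;
        if (nb > lwork / (mn + mnnrhs))      /* what the supplied workspace allows */
            nb = lwork / (mn + mnnrhs);
        if (nb < (nbmin > 2 ? nbmin : 2))    /* below the blocking threshold       */
            nb = 1;                          /* -> unblocked code                  */
        return nb;
    }

    int main(void)
    {
        /* optimal workspace supplied: full blocking */
        printf("%d\n", pick_nb(1000, 500, 10, (500 + 500) * 32, 32, 8));  /* 32 */
        /* only the minimum supplied: falls back to NB = 1 */
        printf("%d\n", pick_nb(1000, 500, 10, 500 + 500, 32, 8));         /*  1 */
        return 0;
    }
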
*/ + + cgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemqrt_("Left", "Conjugate transpose", m, nrhs, n, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + ctrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + ctrtrs_("Upper", "Conjugate transpose", "Non-unit", n, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0.f, b[i__3].i = 0.f; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + cgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + ctrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0.f, b[i__3].i = 0.f; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemlqt_("Left", "Conjugate transpose", n, nrhs, m, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + cgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + ctrtrs_("Lower", "Conjugate transpose", "Non-unit", m, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + clascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + clascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + clascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + clascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + + return 0; + +/* End of CGELST */ + +} /* cgelst_ */ + diff --git a/lapack-netlib/SRC/cgelst.f b/lapack-netlib/SRC/cgelst.f new file mode 100644 index 000000000..7d8e44ddf --- /dev/null +++ b/lapack-netlib/SRC/cgelst.f @@ -0,0 +1,533 @@ +*> \brief CGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its conjugate-transpose, using a QR +*> or LQ factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'C' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'C' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'C': the linear system involves A**H. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. 
+*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by CGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by CGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'C'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> modulus of elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of the modulus of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + REAL ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + REAL RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CGELQT, CGEQRT, CGEMLQT, CGEMQRT, SLABAD, + $ CLASCL, CLASET, CTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'C' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'CGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = REAL( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'CGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL SLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = CLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL CLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL CLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = CLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. 
BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL CLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL CLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL CGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMQRT( 'Left', 'Conjugate transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL CTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL CTRTRS( 'Upper', 'Conjugate transpose', 'Non-unit', + $ N, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL CGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL CTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMLQT( 'Left', 'Conjugate transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL CGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL CTRTRS( 'Lower', 'Conjugate transpose', 'Non-unit', + $ M, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL CLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL CLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL CLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL CLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = REAL( LWOPT ) +* + RETURN +* +* End of CGELST +* + END diff --git a/lapack-netlib/SRC/cggbak.f b/lapack-netlib/SRC/cggbak.f index e8ac34805..159449601 100644 --- a/lapack-netlib/SRC/cggbak.f +++ b/lapack-netlib/SRC/cggbak.f @@ -253,7 +253,7 @@ IF( ILO.EQ.1 ) $ GO TO 50 DO 40 I = ILO - 1, 1, -1 - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -263,7 +263,7 @@ IF( IHI.EQ.N ) $ GO TO 70 DO 60 I = IHI + 1, N - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 60 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -277,7 +277,7 @@ IF( ILO.EQ.1 ) $ GO TO 90 DO 80 I = ILO - 1, 1, -1 - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 80 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -287,7 +287,7 @@ IF( IHI.EQ.N ) $ GO TO 110 DO 100 I = IHI + 1, N - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 100 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/cggbal.f b/lapack-netlib/SRC/cggbal.f index c7a232415..66ba7a881 100644 --- a/lapack-netlib/SRC/cggbal.f +++ b/lapack-netlib/SRC/cggbal.f @@ -535,7 +535,7 @@ IRAB = ICAMAX( N-ILO+1, B( I, ILO ), LDB ) RAB = MAX( RAB, ABS( B( I, IRAB+ILO-1 ) ) ) LRAB = INT( LOG10( RAB+SFMIN ) / BASL+ONE ) - IR = LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) + IR = INT( LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) ) IR = MIN( MAX( IR, LSFMIN ), LSFMAX, LSFMAX-LRAB ) LSCALE( I ) = SCLFAC**IR ICAB = ICAMAX( IHI, A( 1, I ), 1 ) @@ -543,7 +543,7 @@ ICAB = ICAMAX( IHI, B( 1, I ), 1 ) CAB = MAX( CAB, ABS( B( ICAB, I ) ) ) LCAB = INT( LOG10( CAB+SFMIN ) / BASL+ONE ) - JC = RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) + JC = INT( RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) ) JC = MIN( MAX( JC, LSFMIN ), LSFMAX, LSFMAX-LCAB ) RSCALE( I ) = SCLFAC**JC 360 CONTINUE diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index 3efca1e71..fb384b651 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -289,7 +289,7 @@ * CALL CGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = REAL( WORK( M+NP+1 ) ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**H*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 76d7de4ce..1074b4828 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -511,7 +511,7 @@ * IF( JJ.GT.0 ) THEN DO I = JJ, 1, -1 - C = DBLE( A( J+1+I, J ) ) + C = REAL( A( J+1+I, J ) ) CALL CROT( IHI-TOP, A( TOP+1, J+I+1 ), 1, $ A( TOP+1, J+I ), 1, C, $ -CONJG( B( J+1+I, J ) ) ) diff --git a/lapack-netlib/SRC/cgglse.f b/lapack-netlib/SRC/cgglse.f index 4785941db..cca20dfed 100644 --- 
a/lapack-netlib/SRC/cgglse.f +++ b/lapack-netlib/SRC/cgglse.f @@ -276,7 +276,7 @@ * CALL CGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = REAL( WORK( P+MN+1 ) ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**H *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index febd9be8d..0185f4e0d 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL CGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = REAL( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**H*B. * diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index b43febc1f..5227100da 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL CGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = REAL( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**H * diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index 9a4a1efb7..2ddf74b98 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -284,7 +284,7 @@ LIWMIN = 1 END IF LOPT = MAX( LWMIN, N + - $ ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) ) LROPT = LRWMIN LIOPT = LIWMIN END IF diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f index 0c708190c..4b7f43d52 100644 --- a/lapack-netlib/SRC/chegvd.f +++ b/lapack-netlib/SRC/chegvd.f @@ -360,9 +360,9 @@ CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL CHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, INFO ) - LOPT = MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) - LROPT = MAX( REAL( LROPT ), REAL( RWORK( 1 ) ) ) - LIOPT = MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) + LOPT = INT( MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) ) + LROPT = INT( MAX( REAL( LROPT ), REAL( RWORK( 1 ) ) ) ) + LIOPT = INT( MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/chesv_rk.f b/lapack-netlib/SRC/chesv_rk.f index a659c8e79..e123fa299 100644 --- a/lapack-netlib/SRC/chesv_rk.f +++ b/lapack-netlib/SRC/chesv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL CHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 8c1d62a87..50c6827ff 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -523,9 +523,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = CZERO GO TO 50 END IF @@ -551,10 +549,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. 
ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = CZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f index 754be31ed..65d08b783 100644 --- a/lapack-netlib/SRC/chpgvd.f +++ b/lapack-netlib/SRC/chpgvd.f @@ -335,9 +335,9 @@ CALL CHPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL CHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, INFO ) - LWMIN = MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) - LRWMIN = MAX( REAL( LRWMIN ), REAL( RWORK( 1 ) ) ) - LIWMIN = MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) + LWMIN = INT( MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) ) + LRWMIN = INT( MAX( REAL( LRWMIN ), REAL( RWORK( 1 ) ) ) ) + LIWMIN = INT( MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/claed0.c b/lapack-netlib/SRC/claed0.c index 21e408397..2b696508e 100644 --- a/lapack-netlib/SRC/claed0.c +++ b/lapack-netlib/SRC/claed0.c @@ -796,10 +796,10 @@ L10: temp = log((real) (*n)) / log(2.f); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/claed7.c b/lapack-netlib/SRC/claed7.c index 49fc9ed4b..1eaa7e9c2 100644 --- a/lapack-netlib/SRC/claed7.c +++ b/lapack-netlib/SRC/claed7.c @@ -864,11 +864,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/clalsa.c b/lapack-netlib/SRC/clalsa.c index 4bc3830a9..2ef3e1231 100644 --- a/lapack-netlib/SRC/clalsa.c +++ b/lapack-netlib/SRC/clalsa.c @@ -1051,7 +1051,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -1065,7 +1065,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1110,7 +1110,7 @@ L170: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index 95cc33b9d..0a01cc226 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -279,7 +279,7 @@ PARAMETER ( RZERO = 0.0e0, RONE = 1.0e0 ) * .. * .. Local Scalars .. - COMPLEX ALPHA, BETA, CDUM, REFSUM + COMPLEX ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ * ==== Perform update from right within * . computational window. 
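
Note: the CHGEQZ hunks above (and the matching CLAQZ0 hunk further below) replace the per-entry deflation tolerance, which was rebuilt from the two neighbouring entries at every step, with a single BTOL = MAX( SAFMIN, ULP*norm(B) ) computed once for the active window. A standalone illustration of that test (not from the patch; FLT_MIN and FLT_EPSILON stand in for SLAMCH('S') and SLAMCH('P'), and a plain Frobenius norm stands in for CLANHS('F',...)):

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* upper-triangular factor with one tiny diagonal entry */
        float b[3][3] = { { 4.0f, 1.0f,  0.5f },
                          { 0.0f, 1e-9f, 2.0f },
                          { 0.0f, 0.0f,  3.0f } };
        const float safmin = FLT_MIN;       /* stand-in for SLAMCH('S') */
        const float ulp    = FLT_EPSILON;   /* stand-in for SLAMCH('P') */

        /* one norm computation for the whole window */
        float bnorm = 0.0f;
        for (int i = 0; i < 3; ++i)
            for (int j = i; j < 3; ++j)
                bnorm += b[i][j] * b[i][j];
        bnorm = sqrtf(bnorm);

        const float btol = fmaxf(safmin, ulp * bnorm);

        /* reuse the same threshold for every diagonal entry */
        for (int k = 0; k < 3; ++k)
            if (fabsf(b[k][k]) < btol)
                printf("B(%d,%d) is negligible -> deflate\n", k + 1, k + 1);
        return 0;
    }
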
==== * + T1 = V( 1, M22 ) + T2 = T1*CONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ ELSE JBOT = KBOT END IF + T1 = CONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ CONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -610,25 +611,28 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + CONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -688,13 +692,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )* - $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + CONJG( V( 2, M ) )* + $ H( K+2, J ) + CONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +718,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +737,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f index 2284fd65d..9cc25c6dc 100644 --- a/lapack-netlib/SRC/claqz0.f +++ b/lapack-netlib/SRC/claqz0.f @@ -299,7 +299,7 @@ PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, BNORM, BTOL COMPLEX :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -312,7 +312,7 @@ * External Functions EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD, $ CLARTG, CROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, CLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -466,6 +466,9 @@ ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = CLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -528,15 +531,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. 
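
Note: the CLAQR5 hunks above restructure how each 3-element bulge-chasing reflector is applied: the products of V(1,M) with V(2,M) and V(3,M) (or their conjugates) are hoisted out of the inner loops as T1, T2, T3, so the loops no longer recompute those loop-invariant products for every row or column they touch. A sketch of the left-hand update in that hoisted form (not from the patch; it mirrors the updated DO 90 loop above):

    #include <complex.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Apply one reflector, defined by scale v1 and vector (1, v2, v3), from the
       left to ncols consecutive columns of H.  hcol points at H(k+1, jstart) in
       a column-major array with leading dimension ldh, so x[0..2] within a
       column are the three affected rows. */
    static void apply_left(int ncols, int ldh, float complex *hcol,
                           float complex v1, float complex v2, float complex v3)
    {
        /* hoisted, loop-invariant products (the new T1, T2, T3) */
        const float complex t1 = conjf(v1);
        const float complex t2 = t1 * v2;
        const float complex t3 = t1 * v3;

        for (int j = 0; j < ncols; ++j) {
            float complex *x = hcol + (size_t)j * ldh;
            const float complex refsum =
                x[0] + conjf(v2) * x[1] + conjf(v3) * x[2];
            x[0] -= refsum * t1;
            x[1] -= refsum * t2;
            x[2] -= refsum * t3;
        }
    }

    int main(void)
    {
        /* two columns, three rows each (ldh = 3), real data for brevity */
        float complex h[6] = { 1, 2, 3, 4, 5, 6 };
        apply_left(2, 3, h, 0.5f, 0.25f, 0.125f);
        for (int i = 0; i < 6; ++i)
            printf("%g%+gi\n", crealf(h[i]), cimagf(h[i]));
        return 0;
    }
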
BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/clarscl2.f b/lapack-netlib/SRC/clarscl2.f index 26b028dbb..f4e68523b 100644 --- a/lapack-netlib/SRC/clarscl2.f +++ b/lapack-netlib/SRC/clarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b CLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b CLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> CLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> CLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the REAL diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/clartg.f90 b/lapack-netlib/SRC/clartg.f90 index 13a629a34..6231f8520 100644 --- a/lapack-netlib/SRC/clartg.f90 +++ b/lapack-netlib/SRC/clartg.f90 @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,19 +38,20 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. +!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be -!> identical to those returned by CLARTG. +!> identical to those returned by SLARTG. !> !> The algorithm used to compute these quantities incorporates scaling !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine CROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine CROTG fom BLAS1, except that +!> F and G are unchanged on return. !> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -117,7 +115,7 @@ subroutine CLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, one=>sone, two=>stwo, czero, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -129,7 +127,7 @@ subroutine CLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. 
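The reworked clartg.f90 header above now states the special cases explicitly (G = 0 gives C = 1, S = 0; F = 0 gives C = 0 with S chosen so that R is real) alongside the general formulas R = sgn(F)*sqrt(|F|**2 + |G|**2) and S = sgn(F)*conjg(G)/sqrt(|F|**2 + |G|**2), with C = |F|/sqrt(|F|**2 + |G|**2). The C sketch below implements only those formulas; it deliberately omits the safmin/safmax rescaling and the rtmin/rtmax range tests that the routine itself performs (following Anderson's Algorithm 978) to keep the squared magnitudes from overflowing or underflowing.

#include <complex.h>
#include <math.h>
#include <stdio.h>

/* Naive complex Givens rotation: given f, g, produce real c and complex s, r
   with [ c  s ; -conj(s)  c ] * [ f ; g ] = [ r ; 0 ].  No scaling guards. */
static void naive_clartg(float complex f, float complex g,
                         float *c, float complex *s, float complex *r)
{
    if (g == 0.0f) {                 /* C = 1, S = 0, R = F            */
        *c = 1.0f; *s = 0.0f; *r = f;
    } else if (f == 0.0f) {          /* C = 0, S chosen so R is real   */
        *c = 0.0f;
        *r = cabsf(g);
        *s = conjf(g) / *r;
    } else {
        float d = sqrtf(crealf(f)*crealf(f) + cimagf(f)*cimagf(f) +
                        crealf(g)*crealf(g) + cimagf(g)*cimagf(g));
        float complex sgnf = f / cabsf(f);        /* sgn(F) = F/|F|     */
        *c = cabsf(f) / d;
        *s = sgnf * conjf(g) / d;
        *r = sgnf * d;
    }
}

int main(void)
{
    float complex f = 3.0f + 4.0f*I, g = 1.0f - 2.0f*I, s, r;
    float c;
    naive_clartg(f, g, &c, &s, &r);
    /* the rotation should annihilate g: -conj(s)*f + c*g == 0 */
    printf("residual = %g\n", cabsf(-conjf(s)*f + c*g));
    return 0;
}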
- real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -141,6 +139,9 @@ subroutine CLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. +! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -149,30 +150,43 @@ subroutine CLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -181,32 +195,51 @@ subroutine CLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,19 +247,43 @@ subroutine CLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! 
safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return diff --git a/lapack-netlib/SRC/clascl.f b/lapack-netlib/SRC/clascl.f index 399af23a4..f9aace0bc 100644 --- a/lapack-netlib/SRC/clascl.f +++ b/lapack-netlib/SRC/clascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/clascl2.f b/lapack-netlib/SRC/clascl2.f index 2ae27975c..882273b5e 100644 --- a/lapack-netlib/SRC/clascl2.f +++ b/lapack-netlib/SRC/clascl2.f @@ -1,4 +1,4 @@ -*> \brief \b CLASCL2 performs diagonal scaling on a vector. +*> \brief \b CLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,9 +34,9 @@ *> *> \verbatim *> -*> CLASCL2 performs a diagonal scaling on a vector: +*> CLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x -*> where the diagonal REAL matrix D is stored as a vector. +*> where the diagonal REAL matrix D is stored as a matrix. *> *> Eventually to be replaced by BLAS_cge_diag_scale in the new BLAS *> standard. @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/clatbs.f b/lapack-netlib/SRC/clatbs.f index 606f963d3..97abcadce 100644 --- a/lapack-netlib/SRC/clatbs.f +++ b/lapack-netlib/SRC/clatbs.f @@ -278,7 +278,7 @@ $ CDOTU, CLADIV * .. * .. External Subroutines .. - EXTERNAL CAXPY, CSSCAL, CTBSV, SLABAD, SSCAL, XERBLA + EXTERNAL CAXPY, CSSCAL, CTBSV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -324,17 +324,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = SLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / SLAMCH( 'Precision' ) + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/clatrs.f b/lapack-netlib/SRC/clatrs.f index 946ab8068..91334b706 100644 --- a/lapack-netlib/SRC/clatrs.f +++ b/lapack-netlib/SRC/clatrs.f @@ -274,7 +274,7 @@ $ CDOTU, CLADIV * .. * .. 
External Subroutines .. - EXTERNAL CAXPY, CSSCAL, CTRSV, SLABAD, SSCAL, XERBLA + EXTERNAL CAXPY, CSSCAL, CTRSV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -318,17 +318,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = SLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / SLAMCH( 'Precision' ) + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -360,8 +357,74 @@ IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE - TSCAL = HALF / ( SMLNUM*TMAX ) - CALL SSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.SLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = HALF / ( SMLNUM*TMAX ) + CALL SSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be +* represented as a floating-point number. Find the +* maximum offdiagonal absolute value +* max( |Re(A(I,J))|, |Im(A(I,J)| ). If this entry is +* not +/- Infinity, use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + DO I = 1, J - 1 + TMAX = MAX( TMAX, ABS( REAL( A( I, J ) ) ), + $ ABS( AIMAG(A ( I, J ) ) ) ) + END DO + END DO + ELSE +* +* A is lower triangular. +* + DO J = 1, N - 1 + DO I = J + 1, N + TMAX = MAX( TMAX, ABS( REAL( A( I, J ) ) ), + $ ABS( AIMAG(A ( I, J ) ) ) ) + END DO + END DO + END IF +* + IF( TMAX.LE.SLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.SLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm of each column without +* introducing Infinity in the summation. + TSCAL = TWO * TSCAL + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + END IF + TSCAL = TSCAL * HALF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point +* entry. Rely on TRSV to propagate Inf and NaN. 
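The fallback added to clatrs.f above handles the case where some stored column norms have already overflowed: it searches for the largest finite |Re|/|Im| off-diagonal entry, and for any column whose CNORM is not representable it recomputes the 1-norm with every term scaled down first, so the partial sums stay finite. A minimal sketch of that recomputation pattern (cabs1 below is an illustrative |Re|+|Im| magnitude standing in for the routine's CABS2 statement function, and the names are not the routine's):

#include <complex.h>
#include <math.h>

/* |Re| + |Im| magnitude, cheap and free of overflow from squaring. */
static float cabs1(float complex z)
{
    return fabsf(crealf(z)) + fabsf(cimagf(z));
}

/* Recompute the 1-norm of one column under a scale factor tscal chosen small
   enough that the scaled partial sums cannot reach Infinity; the caller keeps
   track of tscal so the scaled norm can still be compared consistently. */
static float scaled_colnorm(const float complex *col, int m, float tscal)
{
    float cnorm = 0.0f;
    for (int i = 0; i < m; ++i)
        cnorm += tscal * cabs1(col[i]);
    return cnorm;
}

int main(void)
{
    float complex col[3] = { 1.0f + 1.0f*I, -2.0f + 0.5f*I, 3.0f - 4.0f*I };
    return scaled_colnorm(col, 3, 0.5f) > 0.0f ? 0 : 1;
}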
+ CALL CTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/clatrs3.c b/lapack-netlib/SRC/clatrs3.c new file mode 100644 index 000000000..f6d76cf49 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.c @@ -0,0 +1,1282 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char 
*name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. 
*/ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
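The growth bound quoted in this header, || s*B(I,RHS) ||_oo + || op(A(I,J)) ||_oo * || s*X(J,RHS) ||_oo <= overflow threshold, is what the routine later obtains from SLARMM for each right-hand side. The helper below is a hypothetical stand-in, not the LAPACK routine: its formula is only an assumption, and it is meant to show the shape of the contract (return some s <= 1 that keeps the bound below the threshold).

#include <stdio.h>

/* Hypothetical stand-in for SLARMM (not the LAPACK routine): pick s in (0,1]
   so that s*bnrm + anrm*(s*xnrm) stays below ovfl.  Inputs are assumed to be
   finite and nonnegative; the bound is evaluated in double so that the test
   itself cannot overflow in single precision. */
static float safe_update_scale(float anrm, float xnrm, float bnrm, float ovfl)
{
    double grow = (double)bnrm + (double)anrm * (double)xnrm;
    if (grow <= (double)ovfl)
        return 1.0f;                               /* no rescaling needed        */
    return (float)(0.5 * (double)ovfl / grow);     /* damp well below threshold  */
}

int main(void)
{
    printf("s = %g\n", safe_update_scale(1.0e30f, 1.0e10f, 1.0f, 3.4e38f));
    return 0;
}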
*/ + +/* ===================================================================== */ +/* Subroutine */ int clatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, complex *a, integer *lda, complex * + x, integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + complex q__1; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + real rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + integer lscale; + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ int clatrs_(char *, char *, char *, char *, + integer *, complex *, integer *, complex *, real *, real *, + integer *); + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! 
+ lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + clatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
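One detail worth noting in the body above: the fallback test is written as if (! (tmax <= slamch_("Overflow"))) rather than if (tmax > ...), so a NaN upper bound (for example from Inf - Inf inside LANGE) also selects the LATRS fallback, because every ordered comparison involving NaN is false. A minimal illustration with made-up values:

#include <math.h>
#include <stdio.h>

int main(void)
{
    float limit = 3.0e38f;
    float vals[3] = { 1.0f, INFINITY, NAN };
    for (int i = 0; i < 3; ++i) {
        int naive  = vals[i] > limit;       /* false for NaN: fallback missed */
        int robust = !(vals[i] <= limit);   /* true for Inf and NaN alike     */
        printf("%g: naive=%d robust=%d\n", vals[i], naive, robust);
    }
    return 0;
}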
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = clange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + csscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = clange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). 
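In isolation, the consistency step described in that comment amounts to: each of the two segments carries its own scale factor, the common scale SCAMIN = min of the two is selected, each segment is multiplied by (SCAMIN / its own scale) * SCALOC, and both segments then record SCAMIN*SCALOC before the GEMM update. A compact sketch with illustrative names (the routine itself keeps these factors in the WORK array rather than passing them as plain arguments):

#include <complex.h>

/* Bring two block segments xi (scale si) and xj (scale sj) to the common
   scale scamin = min(si, sj), additionally damped by the robust factor
   scaloc, so that a subsequent update xi -= Aij*xj cannot overflow.
   Returns the new common scale carried by both segments. */
static float make_consistent(float complex *xi, int ni, float si,
                             float complex *xj, int nj, float sj,
                             float scaloc)
{
    float scamin = si < sj ? si : sj;
    float scal_i = scamin / si * scaloc;
    float scal_j = scamin / sj * scaloc;
    if (scal_i != 1.0f)
        for (int k = 0; k < ni; ++k) xi[k] *= scal_i;
    if (scal_j != 1.0f)
        for (int k = 0; k < nj; ++k) xj[k] *= scal_j;
    return scamin * scaloc;
}

int main(void)
{
    float complex xi[2] = { 1.0f + 0.0f*I, 2.0f };
    float complex xj[2] = { 3.0f, 4.0f*I };
    float s = make_consistent(xi, 2, 1.0f, xj, 2, 0.5f, 1.0f);
    return s > 0.0f ? 0 : 1;
}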
*/ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + csscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + csscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__6, &i__7, &i__8, &q__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("T", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + csscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of CLATRS3 */ + +} /* clatrs3_ */ + diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f new file mode 100644 index 000000000..a902f1ed0 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.f @@ -0,0 +1,666 @@ +*> \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. 
If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. 
+*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( LDX, * ) + REAL CNORM( * ), SCALE( * ), WORK( * ) +* .. 
+* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CLATRS, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'CLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. 
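The sizing above is plain ceiling-division bookkeeping: NBA = ceil(N/NB) block rows, at most NBRHS = 32 right-hand sides processed per block column, NBA*MAX(NBA, MIN(NRHS, 32)) local scale factors, and an NBA-by-NBA table of block norms, which is what WORK(1) reports for the workspace query. A standalone sketch of that computation (nb is taken as a parameter here instead of coming from ILAENV):

#include <stdio.h>

/* Workspace estimate mirroring the WORK(1) computation in CLATRS3:
   nba*max(nba, min(nrhs,32)) local scale factors plus nba*nba block norms. */
static int clatrs3_workspace(int n, int nrhs, int nb)
{
    int nba = (n + nb - 1) / nb;           /* ceil(n / nb) block rows/columns   */
    if (nba < 1) nba = 1;
    int nrhs_blk = nrhs < 32 ? nrhs : 32;  /* at most 32 RHS handled at a time  */
    int lscale = nba * (nba > nrhs_blk ? nba : nrhs_blk);
    int lanrm  = nba * nba;                /* bounds of the off-diagonal blocks */
    return lscale + lanrm;
}

int main(void)
{
    printf("lwork >= %d\n", clatrs3_workspace(1000, 10, 64));
    return 0;
}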
+* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1 ), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = CLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = CLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). 
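Inside this loop, vector segments of X that carry different local scale factors are reconciled before every GEMM update. Stripped of the LAPACK bookkeeping, the reconciliation looks like the C sketch below; the names and values are made up and the data is real-valued, whereas CLATRS3 performs the same rescaling with CSSCAL on COMPLEX segments and also folds in the SLARMM growth factor.

    #include <stdio.h>

    /* Sketch only: bring two scaled segments s_i*b_i and s_j*x_j to the
       common scale scamin = min(s_i, s_j), after which a linear update
       can combine them directly. */
    int main(void)
    {
        float bi[3] = { 1.0f, 2.0f, 3.0f }, si = 0.50f;   /* segment I */
        float xj[2] = { 10.0f, 20.0f },     sj = 0.25f;   /* segment J */
        float scamin = si < sj ? si : sj;
        float ri = scamin / si, rj = scamin / sj;         /* both <= 1 */
        for (int k = 0; k < 3; ++k) bi[k] *= ri;
        for (int k = 0; k < 2; ++k) xj[k] *= rj;
        printf("common scale %.2f: bi[0] = %.2f, xj[0] = %.2f\n",
               scamin, bi[0], xj[0]);
        return 0;
    }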
+* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = CLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL CSSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. 
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = CLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL CGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL CGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL CGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of CLATRS3 +* + END diff --git a/lapack-netlib/SRC/cstedc.c b/lapack-netlib/SRC/cstedc.c index 437c39e96..8f047d1ce 100644 --- a/lapack-netlib/SRC/cstedc.c +++ b/lapack-netlib/SRC/cstedc.c @@ -836,10 +836,10 @@ f"> */ lrwmin = *n - 1 << 1; } else if (icompz == 1) { lgn = (integer) (log((real) (*n)) / log(2.f)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } lwmin = *n * *n; diff --git a/lapack-netlib/SRC/csysv.f b/lapack-netlib/SRC/csysv.f index 6f175e381..4ddabf62f 100644 --- a/lapack-netlib/SRC/csysv.f +++ b/lapack-netlib/SRC/csysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL CSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csysv_rk.f b/lapack-netlib/SRC/csysv_rk.f index 793e39df5..ef5334dcd 100644 --- a/lapack-netlib/SRC/csysv_rk.f +++ b/lapack-netlib/SRC/csysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL CSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csysv_rook.f b/lapack-netlib/SRC/csysv_rook.f index daa9f27c4..aad594e21 100644 --- a/lapack-netlib/SRC/csysv_rook.f +++ b/lapack-netlib/SRC/csysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL CSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csyswapr.f b/lapack-netlib/SRC/csyswapr.f index 185d81922..04004f3c1 100644 --- a/lapack-netlib/SRC/csyswapr.f +++ b/lapack-netlib/SRC/csyswapr.f @@ -58,15 +58,13 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by CSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -116,7 +114,6 @@ * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I COMPLEX TMP * * .. External Functions .. 
@@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL CSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL CSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL CSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL CSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE CSYSWAPR diff --git a/lapack-netlib/SRC/ctprfb.f b/lapack-netlib/SRC/ctprfb.f index 11496180f..6cd5f05bd 100644 --- a/lapack-netlib/SRC/ctprfb.f +++ b/lapack-netlib/SRC/ctprfb.f @@ -1,4 +1,4 @@ -*> \brief \b CTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b CTPRFB applies a complex "triangular-pentagonal" block reflector to a complex matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/ctrsyl3.c b/lapack-netlib/SRC/ctrsyl3.c new file mode 100644 index 000000000..3c119157c --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.c @@ -0,0 +1,2022 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + 
ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,N) */ +/* > The upper triangular matrix B. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* > \ingroup complexSYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ int ctrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, complex *a, integer *lda, complex *b, integer + *ldb, complex *c__, integer *ldc, real *scale, real *swork, integer * + ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3, r__4; + complex q__1; + + /* Local variables */ + real scal; + complex csgn; + real anrm, bnrm, cnrm; + integer awrk, bwrk; + real *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(real *); + integer nb, jj, ll; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + extern /* Subroutine */ int ctrsyl_(char *, char *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + integer *, real *, integer *); + logical lquery; + integer nba, nbb; + real buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (real*)malloc(f2cmax(*m,*n)*sizeof(real)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ctrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + q__1.r = sgn, q__1.i = 0.f; + csgn.r = q__1.r, csgn.i = q__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
*/ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = clange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = clange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + csscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__4, &i__5, &i__6, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = clange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__4, &i__5, &i__6, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = clange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + csscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__3, &i__4, &i__5, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = clange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = clange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__2, &i__3, &i__4, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to */ +/* zero and give up. */ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + r__3 = (r__1 = c__[i__1].r, abs(r__1)), r__4 = (r__2 = r_imag(&c__[ + c_dim1 + 1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + r__3 = scal, r__4 = (r__1 = c__[i__3].r, abs(r__1)), r__3 = + f2cmax(r__3,r__4), r__4 = (r__2 = r_imag(&c__[k + l * + c_dim1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + clascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. 
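   Illustrative note (not part of the upstream translation): BUF records
   the power-of-two compensation that was applied to the local scale
   factors in SWORK to keep them from flushing to zero; folding it back
   into SCALE here mirrors the flushing behavior of the unblocked CTRSYL,
   so the blocked routine remains usable as a drop-in replacement.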
*/ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return 0; + +/* End of CTRSYL3 */ + +} /* ctrsyl3_ */ + diff --git a/lapack-netlib/SRC/ctrsyl3.f b/lapack-netlib/SRC/ctrsyl3.f new file mode 100644 index 000000000..586dc0207 --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b CTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> CTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. 
+*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complexSYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + REAL SCALE +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) + REAL SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX CSGN +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL CLANGE, SLAMCH, SLARMM + EXTERNAL CLANGE, ILAENV, LSAME, SLAMCH, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CSSCAL, CGEMM, CLASCL, CTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'CTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. 
ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) + CSGN = CMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
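*     Editorial illustration (hypothetical numbers, not part of the
*     upstream source): if at this point BUF = 2**(-8) and the
*     normwise largest entry has magnitude SCAL = 2**10, then
*     SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) = 2**8, the CLASCL
*     call below upscales every entry by 2**8 without exceeding
*     BIGNUM, and BUF * SCALOC becomes ONE, so SCALE is not flushed
*     when BUF is folded into it afterwards.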
+* + SCAL = MAX( ABS( REAL( C( 1, 1 ) ) ), + $ ABS( AIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( REAL ( C( K, L ) ) ), + $ ABS( AIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL CLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of CTRSYL3 +* + END diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f index db238f925..b45db6100 100644 --- a/lapack-netlib/SRC/cunbdb2.f +++ b/lapack-netlib/SRC/cunbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX array, dimension (P) +*> TAUP1 is COMPLEX array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX array, dimension (M-P) +*> TAUP2 is COMPLEX array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f index e6afd89c3..117f23d08 100644 --- a/lapack-netlib/SRC/cunbdb4.f +++ b/lapack-netlib/SRC/cunbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX array, dimension (P) +*> TAUP1 is COMPLEX array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX array, dimension (M-P) +*> TAUP2 is COMPLEX array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f index 7acc99cb8..b93a389d6 100644 --- a/lapack-netlib/SRC/cunbdb6.f +++ b/lapack-netlib/SRC/cunbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,16 +173,19 @@ * ===================================================================== * * .. Parameters .. - REAL ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01E0, REALONE = 1.0E0, + REAL ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) COMPLEX NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0), $ ZERO = (0.0E0,0.0E0) ) * .. * .. Local Scalars .. - INTEGER I - REAL NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + REAL EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + REAL SLAMCH * .. * .. External Subroutines .. 
EXTERNAL CGEMV, CLASSQ, XERBLA @@ -211,17 +220,17 @@ CALL XERBLA( 'CUNBDB6', -INFO ) RETURN END IF +* + EPS = SLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -239,27 +248,31 @@ CALL CGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -281,24 +294,22 @@ CALL CGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. 
ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -307,4 +318,3 @@ * End of CUNBDB6 * END - diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f index c973d0b0a..a31a53d79 100644 --- a/lapack-netlib/SRC/cungbr.f +++ b/lapack-netlib/SRC/cungbr.f @@ -233,7 +233,7 @@ END IF END IF END IF - LWKOPT = REAL( WORK( 1 ) ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/dgebak.f b/lapack-netlib/SRC/dgebak.f index e978d7af2..9c086794a 100644 --- a/lapack-netlib/SRC/dgebak.f +++ b/lapack-netlib/SRC/dgebak.f @@ -236,7 +236,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL DSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -250,7 +250,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL DSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/dgees.f b/lapack-netlib/SRC/dgees.f index 82b9d6ee4..24739b1cf 100644 --- a/lapack-netlib/SRC/dgees.f +++ b/lapack-netlib/SRC/dgees.f @@ -302,7 +302,7 @@ * CALL DHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/dgeesx.f b/lapack-netlib/SRC/dgeesx.f index 08fbb6468..f3677fcb3 100644 --- a/lapack-netlib/SRC/dgeesx.f +++ b/lapack-netlib/SRC/dgeesx.f @@ -382,7 +382,7 @@ * CALL DHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/dgelss.f b/lapack-netlib/SRC/dgelss.f index 8ed703fcf..c4190f2e0 100644 --- a/lapack-netlib/SRC/dgelss.f +++ b/lapack-netlib/SRC/dgelss.f @@ -254,11 +254,11 @@ * * Compute space needed for DGEQRF CALL DGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_DGEQRF=DUM(1) + LWORK_DGEQRF = INT( DUM(1) ) * Compute space needed for DORMQR CALL DORMQR( 'L', 'T', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_DORMQR=DUM(1) + LWORK_DORMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + LWORK_DGEQRF ) MAXWRK = MAX( MAXWRK, N + LWORK_DORMQR ) @@ -273,15 +273,15 @@ * Compute space needed for DGEBRD CALL DGEBRD( MM, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL DORGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 3*N + LWORK_DGEBRD ) MAXWRK = MAX( MAXWRK, 3*N + LWORK_DORMBR ) @@ -305,23 +305,23 @@ * Compute space needed for DGELQF CALL DGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_DGELQF=DUM(1) + LWORK_DGELQF = INT( DUM(1) ) * Compute space needed for DGEBRD CALL DGEBRD( M, M, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL 
DORGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) * Compute space needed for DORMLQ CALL DORMLQ( 'L', 'T', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_DORMLQ=DUM(1) + LWORK_DORMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_DGELQF MAXWRK = MAX( MAXWRK, M*M + 4*M + LWORK_DGEBRD ) @@ -341,15 +341,15 @@ * Compute space needed for DGEBRD CALL DGEBRD( M, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL DORGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) MAXWRK = 3*M + LWORK_DGEBRD MAXWRK = MAX( MAXWRK, 3*M + LWORK_DORMBR ) MAXWRK = MAX( MAXWRK, 3*M + LWORK_DORGBR ) diff --git a/lapack-netlib/SRC/dgelst.c b/lapack-netlib/SRC/dgelst.c new file mode 100644 index 000000000..9327da4dd --- /dev/null +++ b/lapack-netlib/SRC/dgelst.c @@ -0,0 +1,1104 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef 
struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief DGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download DGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its transpose, using a QR or LQ */ +/* > factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'T' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'T' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'T': the linear system involves A**T. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by DGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by DGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'T'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
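Editorial aside (illustrative only, not part of the upstream file): a
minimal Fortran sketch of the workspace-query convention documented
above, using the interface SUBROUTINE DGELST declared in this file's
header comment; the program name, dimensions and the LWMAX bound are
hypothetical.

      PROGRAM EXGELST
      INTEGER            M, N, NRHS, LDA, LDB, LWMAX, LWORK, INFO
      PARAMETER          ( M = 4, N = 2, NRHS = 1, LDA = M, LDB = M,
     $                     LWMAX = 1024 )
      DOUBLE PRECISION   A( LDA, N ), B( LDB, NRHS ), WORK( LWMAX )
*     ... fill A (full rank) and B with the problem data ...
*     Workspace query: LWORK = -1 only computes the optimal size and
*     returns it in WORK( 1 ).
      LWORK = -1
      CALL DGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
*     Solve the least squares problem  minimize || B - A*X ||.
*     On exit, rows 1 to N of B hold the solution; INFO > 0 flags a
*     rank deficient A.
      CALL DGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, INFO )
      END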
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int dgelst_(char *trans, integer *m, integer *n, integer * + nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; + + /* Local variables */ + doublereal anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + doublereal rwork[1]; + integer lwopt; + extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + integer nb; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + integer mn; + extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *), dlaset_(char *, integer *, integer + *, doublereal *, doublereal *, doublereal *, integer *), + xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + doublereal bignum; + extern /* Subroutine */ int dgelqt_(integer *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dgeqrt_(integer *, integer *, integer *, doublereal *, + integer *, doublereal *, integer *, doublereal *, integer *); + integer mnnrhs; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int dtrtrs_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *), dgemlqt_(char *, char *, + integer *, integer *, integer *, integer *, doublereal *, integer + *, doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dgemqrt_(char *, char *, integer *, + integer *, integer *, integer *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! 
(lsame_(trans, "N") || lsame_(trans, "T"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "DGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + work[1] = (doublereal) lwopt; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + dlaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (doublereal) lwopt; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "DGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = dlamch_("S") / dlamch_("P"); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = dlange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0. && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + dlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + dlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + dlaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (doublereal) lwopt; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = dlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0. && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + dlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + dlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + dgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! 
tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemqrt_("Left", "Transpose", m, nrhs, n, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + dtrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + dtrtrs_("Upper", "Transpose", "Non-unit", n, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + dgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + dtrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemlqt_("Left", "Transpose", n, nrhs, m, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + dgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + dtrtrs_("Lower", "Transpose", "Non-unit", m, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + dlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + dlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + dlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + dlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + work[1] = (doublereal) lwopt; + + return 0; + +/* End of DGELST */ + +} /* dgelst_ */ + diff --git a/lapack-netlib/SRC/dgelst.f b/lapack-netlib/SRC/dgelst.f new file mode 100644 index 000000000..ca0e04a9b --- /dev/null +++ b/lapack-netlib/SRC/dgelst.f @@ -0,0 +1,531 @@ +*> \brief DGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its transpose, using a QR or LQ +*> factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'T' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'T' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'T': the linear system involves A**T. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. 
+*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by DGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by DGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'T'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL LSAME, ILAENV, DLAMCH, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL DGELQT, DGEQRT, DGEMLQT, DGEMQRT, DLABAD, + $ DLASCL, DLASET, DTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'DGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = DBLE( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL DLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'DGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL DLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = DLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL DLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL DLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL DLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = DLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. 
BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL DLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL DLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL DGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMQRT( 'Left', 'Transpose', M, NRHS, N, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, WORK( MN*NB+1 ), + $ INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL DTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL DTRTRS( 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL DGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL DTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMLQT( 'Left', 'Transpose', N, NRHS, M, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
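The scaling logic above brackets the largest-magnitude entry of A (and of B) into [SMLNUM, BIGNUM] before factorizing, recording IASCL/IBSCL so the solution can be rescaled at the end. The following is a rough C sketch of that decision only, with DBL_MIN/DBL_EPSILON standing in for DLAMCH('S')/DLAMCH('P') and pick_scaling as a hypothetical helper name.

#include <float.h>
#include <math.h>

/* How to rescale a matrix whose largest entry lies outside [SMLNUM, BIGNUM],
   mirroring the DLASCL-based branches: scale up to SMLNUM, down to BIGNUM,
   or, if the matrix is identically zero, return the zero solution. */
typedef enum { SCALE_NONE, SCALE_UP, SCALE_DOWN, ALL_ZERO } scale_kind;

static scale_kind pick_scaling(const double *a, int n, double *factor)
{
    double smlnum = DBL_MIN / DBL_EPSILON;   /* stand-in for safe min / precision */
    double bignum = 1.0 / smlnum;
    double anrm = 0.0;
    for (int i = 0; i < n; ++i)
        anrm = fmax(anrm, fabs(a[i]));       /* max-norm, as DLANGE('M') */

    if (anrm == 0.0)    { *factor = 0.0;          return ALL_ZERO;   }
    if (anrm < smlnum)  { *factor = smlnum / anrm; return SCALE_UP;   }
    if (anrm > bignum)  { *factor = bignum / anrm; return SCALE_DOWN; }
    *factor = 1.0;
    return SCALE_NONE;
}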
+* + CALL DGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL DTRTRS( 'Lower', 'Transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL DLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL DLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL DLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL DLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = DBLE( LWOPT ) +* + RETURN +* +* End of DGELST +* + END diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index d43785d32..ae0f0e908 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -288,7 +288,7 @@ * CALL DGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = WORK( M+NP+1 ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**T*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/dgglse.f b/lapack-netlib/SRC/dgglse.f index 2fd17bbcb..28aeaf6e7 100644 --- a/lapack-netlib/SRC/dgglse.f +++ b/lapack-netlib/SRC/dgglse.f @@ -276,7 +276,7 @@ * CALL DGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = WORK( P+MN+1 ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**T *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 617af274f..39d27a5c9 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL DGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**T*B. * diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index 07f8752d8..ddf4104c5 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL DGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**T * diff --git a/lapack-netlib/SRC/dhgeqz.f b/lapack-netlib/SRC/dhgeqz.f index 3fe2a083c..b5a2917e3 100644 --- a/lapack-netlib/SRC/dhgeqz.f +++ b/lapack-netlib/SRC/dhgeqz.f @@ -536,9 +536,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = ZERO GO TO 70 END IF @@ -564,10 +562,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. 
ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = ZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/dlaed0.c b/lapack-netlib/SRC/dlaed0.c index 95e39b0df..74e58dd2d 100644 --- a/lapack-netlib/SRC/dlaed0.c +++ b/lapack-netlib/SRC/dlaed0.c @@ -827,10 +827,10 @@ L10: temp = log((doublereal) (*n)) / log(2.); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/dlaed4.f b/lapack-netlib/SRC/dlaed4.f index 3ee3ef920..b51e23d85 100644 --- a/lapack-netlib/SRC/dlaed4.f +++ b/lapack-netlib/SRC/dlaed4.f @@ -328,9 +328,12 @@ IF( C.LT.ZERO ) $ C = ABS( C ) IF( C.EQ.ZERO ) THEN -* ETA = B/A +* ETA = B/A * ETA = RHO - TAU - ETA = DLTUB - TAU +* ETA = DLTUB - TAU +* +* Update proposed by Li, Ren-Cang: + ETA = -W / ( DPSI+DPHI ) ELSE IF( A.GE.ZERO ) THEN ETA = ( A+SQRT( ABS( A*A-FOUR*B*C ) ) ) / ( TWO*C ) ELSE diff --git a/lapack-netlib/SRC/dlaed7.c b/lapack-netlib/SRC/dlaed7.c index fd8515261..d23a72be0 100644 --- a/lapack-netlib/SRC/dlaed7.c +++ b/lapack-netlib/SRC/dlaed7.c @@ -885,11 +885,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/dlaeda.c b/lapack-netlib/SRC/dlaeda.c index f4bb214d3..202e1b636 100644 --- a/lapack-netlib/SRC/dlaeda.c +++ b/lapack-netlib/SRC/dlaeda.c @@ -754,7 +754,7 @@ f"> */ /* scheme */ i__1 = *curlvl - 1; - curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1; + curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1; /* Determine size of these matrices. We add HALF to the value of */ /* the SQRT in case the machine underestimates one of these square */ @@ -781,12 +781,12 @@ f"> */ /* rotations and permutation and then multiplying the center matrices */ /* against the current Z. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (k = 1; k <= i__1; ++k) { i__2 = *curlvl - k; i__3 = *curlvl - k - 1; - curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) - + curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) - 1; psiz1 = prmptr[curr + 1] - prmptr[curr]; psiz2 = prmptr[curr + 2] - prmptr[curr + 1]; @@ -847,7 +847,7 @@ f"> */ c__1); i__2 = *tlvls - k; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L70: */ } diff --git a/lapack-netlib/SRC/dlag2s.f b/lapack-netlib/SRC/dlag2s.f index e5a930223..9e6dead49 100644 --- a/lapack-netlib/SRC/dlag2s.f +++ b/lapack-netlib/SRC/dlag2s.f @@ -34,8 +34,8 @@ *> *> \verbatim *> -*> DLAG2S converts a DOUBLE PRECISION matrix, SA, to a SINGLE -*> PRECISION matrix, A. +*> DLAG2S converts a DOUBLE PRECISION matrix, A, to a SINGLE +*> PRECISION matrix, SA. *> *> RMAX is the overflow for the SINGLE PRECISION arithmetic *> DLAG2S checks that all the entries of A are between -RMAX and @@ -128,6 +128,9 @@ REAL SLAMCH EXTERNAL SLAMCH * .. +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. * .. Executable Statements .. 
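The DLAG2S hunk here corrects the documented conversion direction (a DOUBLE PRECISION matrix A into a SINGLE PRECISION matrix SA) and makes the narrowing conversion explicit with REAL(). Below is a small C analogue of the same overflow-checked downcast, with FLT_MAX standing in for SLAMCH('O') and dlag2s_demo as an illustrative name.

#include <float.h>
#include <stddef.h>

/* Copy an m-by-n column-major double matrix into a float matrix.
   Returns 0 on success, 1 if some entry does not fit in single
   precision (the analogue of DLAG2S setting INFO = 1). */
static int dlag2s_demo(int m, int n, const double *a, int lda,
                       float *sa, int ldsa)
{
    for (int j = 0; j < n; ++j) {
        for (int i = 0; i < m; ++i) {
            double aij = a[i + (size_t)j * lda];
            if (aij < -FLT_MAX || aij > FLT_MAX)
                return 1;                     /* entry overflows in single */
            sa[i + (size_t)j * ldsa] = (float)aij;
        }
    }
    return 0;
}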
* RMAX = SLAMCH( 'O' ) @@ -137,7 +140,7 @@ INFO = 1 GO TO 30 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 10 CONTINUE 20 CONTINUE INFO = 0 diff --git a/lapack-netlib/SRC/dlalsa.c b/lapack-netlib/SRC/dlalsa.c index 891ed66a8..4d5c347c3 100644 --- a/lapack-netlib/SRC/dlalsa.c +++ b/lapack-netlib/SRC/dlalsa.c @@ -951,7 +951,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -965,7 +965,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1010,7 +1010,7 @@ L50: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index 0c63ab800..43b4ac72a 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -286,8 +286,8 @@ * .. * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -631,22 +635,25 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. 
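The DLAQR5 hunks in this region hoist V(1,M) and its products out of the inner update loops: REFSUM is now accumulated without the leading V(1,M) factor, and the precomputed T1 = V(1,M), T2 = T1*V(2,M), T3 = T1*V(3,M) are applied per row instead. The small check below, using made-up reflector and column values, confirms the refactored form agrees with the original one.

#include <assert.h>
#include <math.h>

int main(void)
{
    /* Hypothetical reflector components and column entries. */
    double v1 = 0.8, v2 = 0.3, v3 = -0.5;
    double x1 = 1.25, x2 = -2.0, x3 = 0.75;

    /* Original form: the leading v1 is folded into every REFSUM. */
    double refsum_old = v1 * (x1 + v2 * x2 + v3 * x3);
    double y1 = x1 - refsum_old;
    double y2 = x2 - refsum_old * v2;
    double y3 = x3 - refsum_old * v3;

    /* Refactored form: T1, T2, T3 are computed once per reflector. */
    double t1 = v1, t2 = t1 * v2, t3 = t1 * v3;
    double refsum = x1 + v2 * x2 + v3 * x3;
    double z1 = x1 - refsum * t1;
    double z2 = x2 - refsum * t2;
    double z3 = x3 - refsum * t3;

    assert(fabs(y1 - z1) < 1e-14 && fabs(y2 - z2) < 1e-14 &&
           fabs(y3 - z3) < 1e-14);
    return 0;
}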
==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -706,12 +713,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +739,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +758,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f index 1bf65fd60..5b0965406 100644 --- a/lapack-netlib/SRC/dlaqz0.f +++ b/lapack-netlib/SRC/dlaqz0.f @@ -322,7 +322,7 @@ * Local scalars DOUBLE PRECISION :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, - $ TEMP, SWAP + $ TEMP, SWAP, BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -334,7 +334,7 
@@ * External Functions EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD, $ DLARTG, DROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -486,6 +486,9 @@ ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = DLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -562,15 +565,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/dlarmm.c b/lapack-netlib/SRC/dlarmm.c new file mode 100644 index 000000000..eec5d143a --- /dev/null +++ b/lapack-netlib/SRC/dlarmm.c @@ -0,0 +1,605 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag 
inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLARMM */ + +/* Definition: */ +/* =========== */ + +/* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) */ + +/* DOUBLE PRECISION ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is DOUBLE PRECISION */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is DOUBLE PRECISION */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +doublereal dlarmm_(doublereal *anorm, doublereal *bnorm, doublereal *cnorm) +{ + /* System generated locals */ + doublereal ret_val; + + /* Local variables */ + extern doublereal dlamch_(char *); + doublereal bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = dlamch_("Safe minimum") / dlamch_("Precision"); + bignum = 1. / smlnum / 4.; + +/* Compute a scale factor. */ + + ret_val = 1.; + if (*bnorm <= 1.) 
{ + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5 / *bnorm; + } + } + return ret_val; + +/* ==== End of DLARMM ==== */ + +} /* dlarmm_ */ + diff --git a/lapack-netlib/SRC/dlarmm.f b/lapack-netlib/SRC/dlarmm.f new file mode 100644 index 000000000..c36042009 --- /dev/null +++ b/lapack-netlib/SRC/dlarmm.f @@ -0,0 +1,99 @@ +*> \brief \b DLARMM +* +* Definition: +* =========== +* +* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* DOUBLE PRECISION ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> DLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is DOUBLE PRECISION +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is DOUBLE PRECISION +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + DOUBLE PRECISION ANORM, BNORM, CNORM +* .. Parameters .. + DOUBLE PRECISION ONE, HALF, FOUR + PARAMETER ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION BIGNUM, SMLNUM +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + DLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + DLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + DLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of DLARMM ==== +* + END diff --git a/lapack-netlib/SRC/dlarscl2.f b/lapack-netlib/SRC/dlarscl2.f index 2468e2702..cc4b9aa3c 100644 --- a/lapack-netlib/SRC/dlarscl2.f +++ b/lapack-netlib/SRC/dlarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b DLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b DLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> DLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> DLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is DOUBLE PRECISION array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. 
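DLARMM above returns a factor s in (0, 1] such that (s*C) - A*(s*B) cannot overflow, using only the three infinity norms. The C transcription below mirrors that logic as a sketch; DBL_MIN/DBL_EPSILON stand in for DLAMCH('Safe minimum')/DLAMCH('Precision') and dlarmm_demo is an illustrative name.

#include <float.h>

/* Return s in (0, 1] such that (s*C) - A*(s*B) cannot overflow,
   given the infinity norms of A, B and C (all assumed >= 0). */
static double dlarmm_demo(double anorm, double bnorm, double cnorm)
{
    double smlnum = DBL_MIN / DBL_EPSILON;   /* safe minimum / precision */
    double bignum = (1.0 / smlnum) / 4.0;    /* leave a headroom factor of 4 */
    double s = 1.0;

    if (bnorm <= 1.0) {
        if (anorm * bnorm > bignum - cnorm)
            s = 0.5;                         /* halving suffices when ||B|| <= 1 */
    } else {
        if (anorm > (bignum - cnorm) / bnorm)
            s = 0.5 / bnorm;                 /* shrink proportionally to ||B|| */
    }
    return s;
}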
+*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlartg.f90 b/lapack-netlib/SRC/dlartg.f90 index ef8c6e386..b7049c32f 100644 --- a/lapack-netlib/SRC/dlartg.f90 +++ b/lapack-netlib/SRC/dlartg.f90 @@ -11,7 +11,7 @@ ! SUBROUTINE DLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C, F, G, R, S +! REAL(wp) C, F, G, R, S ! .. ! !> \par Purpose: @@ -45,8 +45,6 @@ !> floating point operations (saves work in DBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. -!> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim ! @@ -112,7 +110,7 @@ subroutine DLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, half=>dhalf, one=>done, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine DLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! f1 = abs( f ) @@ -143,20 +145,18 @@ subroutine DLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r*u end if return end subroutine diff --git a/lapack-netlib/SRC/dlascl.f b/lapack-netlib/SRC/dlascl.f index 05ad1c4f3..0a4bf21ce 100644 --- a/lapack-netlib/SRC/dlascl.f +++ b/lapack-netlib/SRC/dlascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/dlascl2.f b/lapack-netlib/SRC/dlascl2.f index 901e43c49..568e296ad 100644 --- a/lapack-netlib/SRC/dlascl2.f +++ b/lapack-netlib/SRC/dlascl2.f @@ -1,4 +1,4 @@ -*> \brief \b DLASCL2 performs diagonal scaling on a vector. +*> \brief \b DLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> DLASCL2 performs a diagonal scaling on a vector: +*> DLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is DOUBLE PRECISION array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. 
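The dlartg.f90 rewrite above drops the reciprocal-based formulation: in the well-scaled branch it computes d = sqrt(f*f + g*g), c = |f|/d, r = sign(d, f) and s = g/r directly. The quick numerical check below exercises only that branch (the guarded branch that clamps with u = min(safmax, max(safmin, f1, g1)) is not shown).

#include <assert.h>
#include <math.h>

int main(void)
{
    double f = 3.0, g = -4.0;

    double d = sqrt(f * f + g * g);          /* length of the (f, g) pair */
    double c = fabs(f) / d;                  /* cosine, nonnegative here */
    double r = copysign(d, f);               /* sign(d, f) */
    double s = g / r;                        /* sine */

    /* The rotation maps (f, g) to (r, 0) and stays orthogonal. */
    assert(fabs(c * f + s * g - r) < 1e-14);
    assert(fabs(-s * f + c * g) < 1e-14);
    assert(fabs(c * c + s * s - 1.0) < 1e-14);
    return 0;
}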
*> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlasd0.c b/lapack-netlib/SRC/dlasd0.c index c702665b0..0f88527ef 100644 --- a/lapack-netlib/SRC/dlasd0.c +++ b/lapack-netlib/SRC/dlasd0.c @@ -824,7 +824,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/dlasda.c b/lapack-netlib/SRC/dlasda.c index 72f9d55f3..a9190f805 100644 --- a/lapack-netlib/SRC/dlasda.c +++ b/lapack-netlib/SRC/dlasda.c @@ -1027,7 +1027,7 @@ f"> */ /* Now conquer each subproblem bottom-up. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); for (lvl = nlvl; lvl >= 1; --lvl) { lvl2 = (lvl << 1) - 1; @@ -1039,7 +1039,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/dlat2s.f b/lapack-netlib/SRC/dlat2s.f index 3d00fe0a3..c926e9930 100644 --- a/lapack-netlib/SRC/dlat2s.f +++ b/lapack-netlib/SRC/dlat2s.f @@ -134,6 +134,9 @@ LOGICAL LSAME EXTERNAL SLAMCH, LSAME * .. +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. * .. Executable Statements .. * RMAX = SLAMCH( 'O' ) @@ -146,7 +149,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE @@ -157,7 +160,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 30 CONTINUE 40 CONTINUE END IF diff --git a/lapack-netlib/SRC/dlatbs.f b/lapack-netlib/SRC/dlatbs.f index 4b71d5399..6a812743b 100644 --- a/lapack-netlib/SRC/dlatbs.f +++ b/lapack-netlib/SRC/dlatbs.f @@ -310,6 +310,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -317,7 +318,6 @@ * SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f index 43f92911d..be156bee2 100644 --- a/lapack-netlib/SRC/dlatrs.f +++ b/lapack-netlib/SRC/dlatrs.f @@ -264,8 +264,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX - DOUBLE PRECISION DASUM, DDOT, DLAMCH - EXTERNAL LSAME, IDAMAX, DASUM, DDOT, DLAMCH + DOUBLE PRECISION DASUM, DDOT, DLAMCH, DLANGE + EXTERNAL LSAME, IDAMAX, DASUM, DDOT, DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DAXPY, DSCAL, DTRSV, XERBLA @@ -304,6 +304,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -311,7 +312,6 @@ * SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -343,8 +343,67 @@ IF( TMAX.LE.BIGNUM ) THEN TSCAL = ONE ELSE - TSCAL = ONE / ( SMLNUM*TMAX ) - CALL DSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = ONE / ( SMLNUM*TMAX ) + CALL DSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be represented +* as floating-point number. Find the offdiagonal entry A( I, J ) +* with the largest absolute value. If this entry is not +/- Infinity, +* use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + $ TMAX ) + END DO + ELSE +* +* A is lower triangular. 
+* + DO J = 1, N - 1 + TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1, + $ SUMJ ), TMAX ) + END DO + END IF +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.DLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm without introducing Infinity +* in the summation + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + END IF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point entry. +* Rely on TRSV to propagate Inf and NaN. + CALL DTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/dlatrs3.c b/lapack-netlib/SRC/dlatrs3.c new file mode 100644 index 000000000..46eca6379 --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.c @@ -0,0 +1,1265 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + 
ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. 
*/ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is DOUBLE PRECISION array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
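To make the overflow bound above concrete, here is a small standalone C sketch of the scale-factor rule the block update relies on; it restates the DLARMM logic introduced earlier in this patch. The function name scale_bound and the use of <float.h> constants to approximate the DLAMCH-based thresholds are illustrative assumptions, not part of the patch.

    #include <float.h>

    /* Return s in (0, 1] such that s*C - A*(s*B) cannot overflow, given
       infinity norms anorm, bnorm, cnorm >= 0 (cf. DLARMM).  BIGNUM is
       approximated here with <float.h> constants; the Fortran reference
       uses DLAMCH('Safe minimum') / DLAMCH('Precision'). */
    static double scale_bound(double anorm, double bnorm, double cnorm)
    {
        const double smlnum = DBL_MIN / DBL_EPSILON;
        const double bignum = (1.0 / smlnum) / 4.0;
        double s = 1.0;
        if (bnorm <= 1.0) {
            if (anorm * bnorm > bignum - cnorm)
                s = 0.5;
        } else {
            if (anorm > (bignum - cnorm) / bnorm)
                s = 0.5 / bnorm;
        }
        return s;
    }

The block update then scales B( I, RHS ) and X( J, RHS ) by the returned factor before the GEMM call, so the accumulated product stays below the overflow threshold.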
*/ + +/* ===================================================================== */ +/* Subroutine */ int dlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublereal *a, integer *lda, + doublereal *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + integer *); + doublereal w[64]; + extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + extern /* Subroutine */ int dlatrs_(char *, char *, char *, char *, + integer *, doublereal *, integer *, doublereal *, doublereal *, + doublereal *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I+KK*LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters */ + + if (! upper && ! 
lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + dlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = dlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + dscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = dlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = i2 - i1; + dscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = j2 - j1; + dscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) 
{ + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of DLATRS3 */ + +} /* dlatrs3_ */ + diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f new file mode 100644 index 000000000..b4a98bc78 --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. 
If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. 
For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE, DLARMM + EXTERNAL DLAMCH, DLANGE, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLATRS, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks +* + NB = MAX( 8, ILAENV( 1, 'DLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I+KK*LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. 
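As a concrete illustration of the two workspace parts (numbers assumed for this example only): with N = 1000 and the blocking factor resolving to NB = 64, NBA = (1000 + 63)/64 = 16; for NRHS >= 32 the scale-factor part takes NBA*32 = 512 entries and the block-norm part NBA*NBA = 256 entries, so the code below reports WORK( 1 ) = 768.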
+ LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = DLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = DLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). 
+* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = DLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC * WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK ) * RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL DSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. 
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I + KK*LDS), WORK( J + KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = DLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL DGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) +* + CALL DGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of DLATRS3 +* + END diff --git a/lapack-netlib/SRC/dorbdb2.f b/lapack-netlib/SRC/dorbdb2.f index 64e4645bc..a0dacbb16 100644 --- a/lapack-netlib/SRC/dorbdb2.f +++ b/lapack-netlib/SRC/dorbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is DOUBLE PRECISION array, dimension (P) +*> TAUP1 is DOUBLE PRECISION array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is DOUBLE PRECISION array, dimension (M-P) +*> TAUP2 is DOUBLE PRECISION array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. 
*> \endverbatim diff --git a/lapack-netlib/SRC/dorbdb4.f b/lapack-netlib/SRC/dorbdb4.f index a09568415..08604be45 100644 --- a/lapack-netlib/SRC/dorbdb4.f +++ b/lapack-netlib/SRC/dorbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is DOUBLE PRECISION array, dimension (P) +*> TAUP1 is DOUBLE PRECISION array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is DOUBLE PRECISION array, dimension (M-P) +*> TAUP2 is DOUBLE PRECISION array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f index fac52f760..45c8ba8a2 100644 --- a/lapack-netlib/SRC/dorbdb6.f +++ b/lapack-netlib/SRC/dorbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,15 +173,18 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01D0, REALONE = 1.0D0, + DOUBLE PRECISION ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) DOUBLE PRECISION NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 ) * .. * .. Local Scalars .. - INTEGER I - DOUBLE PRECISION NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + DOUBLE PRECISION EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH * .. * .. External Subroutines .. EXTERNAL DGEMV, DLASSQ, XERBLA @@ -210,17 +219,17 @@ CALL XERBLA( 'DORBDB6', -INFO ) RETURN END IF +* + EPS = DLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -238,27 +247,31 @@ CALL DGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. 
* Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -280,24 +293,22 @@ CALL DGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -306,4 +317,3 @@ * End of DORBDB6 * END - diff --git a/lapack-netlib/SRC/dorcsd2by1.f b/lapack-netlib/SRC/dorcsd2by1.f index 06bf53db1..25fab0f33 100644 --- a/lapack-netlib/SRC/dorcsd2by1.f +++ b/lapack-netlib/SRC/dorcsd2by1.f @@ -580,7 +580,7 @@ * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( JOBV1T, 'N', JOBU1, JOBU2, 'T', M, Q, P, THETA, - $ WORK(IPHI), V1T, LDV1T, DUM2, 1, U1, LDU1, U2, + $ WORK(IPHI), V1T, LDV1T, DUM1, 1, U1, LDU1, U2, $ LDU2, WORK(IB11D), WORK(IB11E), WORK(IB12D), $ WORK(IB12E), WORK(IB21D), WORK(IB21E), $ WORK(IB22D), WORK(IB22E), WORK(IBBCSD), LBBCSD, @@ -635,7 +635,7 @@ * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( 'N', JOBV1T, JOBU2, JOBU1, 'T', M, M-Q, M-P, - $ THETA, WORK(IPHI), DUM2, 1, V1T, LDV1T, U2, + $ THETA, WORK(IPHI), DUM1, 1, V1T, LDV1T, U2, $ LDU2, U1, LDU1, WORK(IB11D), WORK(IB11E), $ WORK(IB12D), WORK(IB12E), WORK(IB21D), $ WORK(IB21E), WORK(IB22D), WORK(IB22E), @@ -706,7 +706,7 @@ * Simultaneously diagonalize X11 and X21. 
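The rewritten DORBDB6 above implements classical Gram-Schmidt with at most one reorthogonalization, following the "twice is enough" analysis in the cited Giraud, Langou and Rozložník report: project, compare the new norm against ALPHA times the old one, flush to zero when the remainder is at round-off level, and otherwise project exactly once more. A condensed C sketch of that control flow for a single dense vector against an orthonormal Q (plain loops stand in for DGEMV and DLASSQ; the round-off threshold is a simplification of the N*EPS test):

/* Illustrative sketch, not part of the patch.  Orthogonalize x (length
 * m, assumed to have unit norm on entry) against the n orthonormal
 * columns of the column-major matrix q (leading dimension ldq) using
 * classical Gram-Schmidt with at most one reorthogonalization.
 * Returns the final norm; x is flushed to zero if it lies numerically
 * in the range of q. */
#include <math.h>
#include <float.h>

static double cgs2(double *x, int m, const double *q, int ldq, int n)
{
    const double alpha = 0.01;     /* acceptance factor, as in DORBDB6 */
    double norm = 1.0;             /* caller guarantees ||x||_2 = 1    */

    for (int pass = 0; pass < 2; ++pass) {
        for (int j = 0; j < n; ++j) {          /* x := x - q * (q^T x) */
            double w = 0.0;
            for (int i = 0; i < m; ++i) w += q[i + j * ldq] * x[i];
            for (int i = 0; i < m; ++i) x[i] -= w * q[i + j * ldq];
        }
        double norm_new = 0.0;
        for (int i = 0; i < m; ++i) norm_new = hypot(norm_new, x[i]);

        if (norm_new >= alpha * norm)          /* projection survived  */
            return norm_new;
        if (norm_new <= n * DBL_EPSILON * norm || pass == 1) {
            for (int i = 0; i < m; ++i) x[i] = 0.0;   /* flush to zero */
            return 0.0;
        }
        norm = norm_new;                       /* shrunk: project again */
    }
    return norm;                               /* not reached */
}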
* CALL DBBCSD( JOBU2, JOBU1, 'N', JOBV1T, 'N', M, M-P, M-Q, - $ THETA, WORK(IPHI), U2, LDU2, U1, LDU1, DUM2, + $ THETA, WORK(IPHI), U2, LDU2, U1, LDU1, DUM1, $ 1, V1T, LDV1T, WORK(IB11D), WORK(IB11E), $ WORK(IB12D), WORK(IB12E), WORK(IB21D), $ WORK(IB21E), WORK(IB22D), WORK(IB22E), diff --git a/lapack-netlib/SRC/dorgbr.f b/lapack-netlib/SRC/dorgbr.f index 1b242ff97..7dfd03961 100644 --- a/lapack-netlib/SRC/dorgbr.f +++ b/lapack-netlib/SRC/dorgbr.f @@ -232,7 +232,7 @@ END IF END IF END IF - LWKOPT = WORK( 1 ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f index 556326388..df215ae1a 100644 --- a/lapack-netlib/SRC/dspgvd.f +++ b/lapack-netlib/SRC/dspgvd.f @@ -307,8 +307,8 @@ CALL DSPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL DSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) - LWMIN = MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) - LIWMIN = MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) + LWMIN = INT( MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) ) + LIWMIN = INT( MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/dstedc.c b/lapack-netlib/SRC/dstedc.c index ef2eeabe8..56511d6cf 100644 --- a/lapack-netlib/SRC/dstedc.c +++ b/lapack-netlib/SRC/dstedc.c @@ -806,10 +806,10 @@ f"> */ lwmin = *n - 1 << 1; } else { lgn = (integer) (log((doublereal) (*n)) / log(2.)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } if (icompz == 1) { diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index edbe896fe..eaaecd8d9 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -257,7 +257,7 @@ LWMIN = 2*N + 1 END IF LOPT = MAX( LWMIN, 2*N + - $ ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF WORK( 1 ) = LOPT diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f index 61134bedc..3b38665a7 100644 --- a/lapack-netlib/SRC/dsygvd.f +++ b/lapack-netlib/SRC/dsygvd.f @@ -330,8 +330,8 @@ CALL DSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL DSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LIWORK, $ INFO ) - LOPT = MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) - LIOPT = MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) + LOPT = INT( MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) ) + LIOPT = INT( MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. 
INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/dsysv.f b/lapack-netlib/SRC/dsysv.f index a6305e13c..ed6629ad9 100644 --- a/lapack-netlib/SRC/dsysv.f +++ b/lapack-netlib/SRC/dsysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL DSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsysv_rk.f b/lapack-netlib/SRC/dsysv_rk.f index 05d8f7d3f..db8fd36dd 100644 --- a/lapack-netlib/SRC/dsysv_rk.f +++ b/lapack-netlib/SRC/dsysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL DSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsysv_rook.f b/lapack-netlib/SRC/dsysv_rook.f index 6ebb52eae..85f293309 100644 --- a/lapack-netlib/SRC/dsysv_rook.f +++ b/lapack-netlib/SRC/dsysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL DSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsyswapr.f b/lapack-netlib/SRC/dsyswapr.f index c60ccbefc..93f6195f2 100644 --- a/lapack-netlib/SRC/dsyswapr.f +++ b/lapack-netlib/SRC/dsyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is DOUBLE PRECISION array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by DSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is DOUBLE PRECISION array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, N ) + DOUBLE PRECISION A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I DOUBLE PRECISION TMP * * .. External Functions .. 
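Several of the workspace fixes above have the same shape: the optimal size reported by a LWORK = -1 query comes back in WORK(1), a DOUBLE PRECISION element, and must be converted with INT before it can be used as an array length. Seen from a C caller the pattern looks roughly as follows (a hedged sketch; the exact external name, hidden character-length arguments and integer width depend on the LAPACK build actually linked, e.g. ILP64 builds use 64-bit integers):

/* Illustrative sketch, not part of the patch.  Conventional
 * Fortran-style prototype for DSYTRF; adjust types and name mangling
 * to the LAPACK headers in use. */
#include <stdio.h>
#include <stdlib.h>

extern void dsytrf_(const char *uplo, const int *n, double *a,
                    const int *lda, int *ipiv, double *work,
                    const int *lwork, int *info);

int main(void)
{
    int n = 100, lda = 100, info = 0, lwork = -1;
    double *a = calloc((size_t)lda * n, sizeof *a);
    int *ipiv = malloc((size_t)n * sizeof *ipiv);
    for (int i = 0; i < n; ++i) a[i + (size_t)i * lda] = 1.0;   /* identity */

    /* Workspace query: LWORK = -1 returns the optimal size in WORK(1). */
    double wkopt;
    dsytrf_("U", &n, a, &lda, ipiv, &wkopt, &lwork, &info);

    lwork = (int)wkopt;                        /* the INT( WORK( 1 ) ) step */
    double *work = malloc((size_t)lwork * sizeof *work);

    dsytrf_("U", &n, a, &lda, ipiv, work, &lwork, &info);
    printf("info = %d, optimal lwork = %d\n", info, lwork);

    free(a); free(ipiv); free(work);
    return 0;
}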
@@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL DSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL DSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL DSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL DSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE DSYSWAPR diff --git a/lapack-netlib/SRC/dtprfb.f b/lapack-netlib/SRC/dtprfb.f index a3fc7d6c6..c015075b3 100644 --- a/lapack-netlib/SRC/dtprfb.f +++ b/lapack-netlib/SRC/dtprfb.f @@ -1,4 +1,4 @@ -*> \brief \b DTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b DTPRFB applies a real "triangular-pentagonal" block reflector to a real matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/dtrsyl3.c b/lapack-netlib/SRC/dtrsyl3.c new file mode 100644 index 000000000..9cfbe3dab --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.c @@ -0,0 +1,2060 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint 
icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DTRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by DHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. 
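A small note on pow_ii above: unlike the classic f2c helper, this local version takes both arguments by value, which is why the calls in dstedc.c earlier in this patch dropped the address-of operators. DSTEDC uses it to find the smallest LGN with 2**LGN >= N; a standalone C rendering of that computation (ipow is a stand-in for the by-value pow_ii):

/* Illustrative sketch, not part of the patch. */
#include <math.h>

static int ipow(int x, int n)              /* by-value integer power */
{
    int p = 1;
    while (n-- > 0) p *= x;
    return p;
}

/* Smallest lgn with 2**lgn >= n (n >= 1), mirroring the computation in
 * dstedc.c; the two corrections guard against log() rounding down. */
static int ceil_log2(int n)
{
    int lgn = (int)(log((double)n) / log(2.0));
    if (ipow(2, lgn) < n) ++lgn;
    if (ipow(2, lgn) < n) ++lgn;
    return lgn;
}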
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is DOUBLE PRECISION array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). 
Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int dtrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublereal *a, integer *lda, doublereal *b, + integer *ldb, doublereal *c__, integer *ldc, doublereal *scale, + integer *iwork, integer *liwork, doublereal *swork, integer *ldswork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3; + + /* Local variables */ + doublereal scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + integer *), dgemm_(char *, char *, integer *, integer *, integer * + , doublereal *, doublereal *, integer *, doublereal *, integer *, + doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(doublereal *); + integer l2, nb, pc, jj, ll; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int dtrsyl_(char *, char *, integer *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *); + integer nba, nbb; + doublereal buf, sgn; + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. 
*/ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! + lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + dtrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0. && a[l + 1 + l * a_dim1] != 0.) + { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N ) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0. && b[l + 1 + l * b_dim1] != 0.) + { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. 
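The partitioning loops above are what let DTRSYL3 block a quasi-triangular matrix safely: a nominal boundary every NB rows, moved down by one whenever it would cut through a 2-by-2 diagonal block of the real Schur form (recognized by a nonzero subdiagonal entry). The same logic in a standalone, 0-based C form (illustrative only; the routine itself keeps these indices in IWORK):

/* Illustrative sketch, not part of the patch.  Compute block start
 * indices for an m-by-m real Schur form matrix a (column-major,
 * leading dimension lda) with nominal block size nb.  starts must have
 * room for ceil(m/nb) + 1 entries; the number of blocks actually used
 * is returned, and starts[nblk] == m on exit. */
#include <stdbool.h>

static int partition_schur(const double *a, int lda, int m, int nb,
                           int *starts)
{
    int nblk = (m + nb - 1) / nb;
    if (nblk < 1) nblk = 1;
    for (int k = 0; k < nblk; ++k) starts[k] = k * nb;
    starts[nblk] = m;

    for (int k = 0; k < nblk; ++k) {
        bool skip = false;
        for (int l = starts[k]; l < starts[k + 1]; ++l) {
            if (skip) { skip = false; continue; }
            if (l + 1 >= m) continue;              /* trailing 1x1 block */
            if (a[l + (l + 1) * lda] != 0.0 &&
                a[(l + 1) + l * lda] != 0.0) {     /* 2x2 block at (l,l) */
                if (l + 1 == starts[k + 1])
                    ++starts[k + 1];               /* do not split it    */
                else
                    skip = true;                   /* step over it       */
            }
        }
    }
    if (nblk > 1 && starts[nblk - 1] >= starts[nblk]) {
        starts[nblk - 1] = m;                      /* last block emptied */
        --nblk;
    }
    return nblk;
}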
*/ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
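The "second scaling factor" referred to above is BUF: whenever SCALOC is so small that multiplying it into the block scale factors would flush them to zero, its binary exponent (myexp_, i.e. the exponent returned by frexp) is moved into BUF, the per-block factors are capped at BIGNUM, and the computation continues; BUF is folded back into SCALE at the very end. A reduced C sketch of just the exponent transfer (names are illustrative; the SCALOC = 0 case is handled separately by the routine):

/* Illustrative sketch, not part of the patch.  Move the binary
 * exponent of a tiny nonzero scale factor into a separate power-of-two
 * accumulator so that the running scale never underflows to zero.
 * After the call, 0.5 <= *scaloc < 1 and the lost magnitude is
 * remembered in *buf, which ends up < 1 and is later folded into the
 * global SCALE. */
#include <math.h>

static void absorb_exponent(double *scaloc, double *buf)
{
    int e;
    (void)frexp(*scaloc, &e);        /* *scaloc == mantissa * 2^e        */
    *buf    = ldexp(*buf, e);        /* e < 0 for tiny scaloc: buf shrinks */
    *scaloc = ldexp(*scaloc, -e);    /* restore scaloc to [0.5, 1)       */
}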
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
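Every linear update in these loops is guarded by DLARMM: given the norms of the coefficient block, of the freshly solved block and of the block being updated, it returns a factor SCALOC <= 1 under which the update cannot overflow, and that factor is merged with the consistency scaling before the two DSCAL sweeps and the GEMM. A deliberately simplified stand-in, only to show the intent (the real DLARMM chooses its thresholds far more carefully):

/* Illustrative sketch, not part of the patch.  Return a power-of-two
 * factor s in (0, 1] such that the update  c := c - a * x, with
 * ||a|| <= a_nrm, ||x|| <= x_nrm and ||c|| <= c_nrm, cannot overflow
 * once x and c have been scaled by s.  Simplified stand-in for
 * LAPACK's DLARMM; the threshold below is illustrative only. */
#include <float.h>

static double safe_update_scale(double a_nrm, double x_nrm, double c_nrm)
{
    const double big = DBL_MAX / 4.0;       /* headroom below overflow */
    double s = 1.0;
    while (a_nrm * (s * x_nrm) > big - s * c_nrm)
        s *= 0.5;                           /* halve until the bound holds */
    return s;
}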
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = dlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = dlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + dgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = dlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__4, &i__5, &i__6, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = dlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + dscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + dgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = dlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = dlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__2, &i__3, &i__4, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1. && buf > 0.) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + d__2 = scal, d__3 = (d__1 = c__[k + l * c_dim1], abs(d__1)); + scal = f2cmax(d__2,d__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. 
/ buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + dlascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return 0; + +/* End of DTRSYL3 */ + +} /* dtrsyl3_ */ + diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f new file mode 100644 index 000000000..c44ec3808 --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.f @@ -0,0 +1,1241 @@ +*> \brief \b DTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> DTRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by DHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. 
LDC >= max(1,M)
+*> \endverbatim
+*>
+*> \param[out] SCALE
+*> \verbatim
+*>          SCALE is DOUBLE PRECISION
+*>          The scale factor, scale, set <= 1 to avoid overflow in X.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1)
+*>          + ((N + NB - 1) / NB + 1), where NB is the optimal block size.
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal dimension of the IWORK array,
+*>          returns this value as the first entry of the IWORK array, and
+*>          no error message related to LIWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] SWORK
+*> \verbatim
+*>          SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS),
+*>          MAX(1,COLS)).
+*>          On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS
+*>          and SWORK(2) returns the optimal COLS.
+*> \endverbatim
+*>
+*> \param[in] LDSWORK
+*> \verbatim
+*>          LDSWORK is INTEGER
+*>          LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1)
+*>          and NB is the optimal block size.
+*>
+*>          If LDSWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the optimal dimensions of the SWORK matrix,
+*>          returns these values as the first and second entry of the SWORK
+*>          matrix, and no error message related to LDSWORK is issued by
+*>          XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0: successful exit
+*>          < 0: if INFO = -i, the i-th argument had an illegal value
+*>          = 1: A and B have common or very close eigenvalues; perturbed
+*>               values were used to solve the equation (but the matrices
+*>               A and B are unchanged).
+*> \endverbatim
+*
+*  =====================================================================
+*  References:
+*   E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of
+*   algorithms: The triangular Sylvester equation, ACM Transactions
+*   on Mathematical Software (TOMS), volume 29, pages 218--243.
+*
+*   A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel
+*   Solution of the Triangular Sylvester Equation. Lecture Notes in
+*   Computer Science, vol 12043, pages 82--92, Springer.
+*
+*  Contributor:
+*   Angelika Schwarz, Umea University, Sweden.
+*
+*  =====================================================================
+      SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C,
+     $                    LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK,
+     $                    INFO )
+      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+      CHARACTER          TRANA, TRANB
+      INTEGER            INFO, ISGN, LDA, LDB, LDC, M, N,
+     $                   LIWORK, LDSWORK
+      DOUBLE PRECISION   SCALE
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * )
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), C( LDC, * ),
+     $                   SWORK( LDSWORK, * )
+*     ..
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            NOTRNA, NOTRNB, LQUERY, SKIP
+      INTEGER            AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ,
+     $                   K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC
+      DOUBLE PRECISION   ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC,
+     $                   SCAMIN, SGN, XNRM, BUF, SMLNUM
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   WNRM( MAX( M, N ) )
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      INTEGER            ILAENV
+      DOUBLE PRECISION   DLANGE, DLAMCH, DLARMM
+      EXTERNAL           DLANGE, DLAMCH, DLARMM, ILAENV, LSAME
+*     ..
+*     .. External Subroutines ..
+ EXTERNAL DGEMM, DLASCL, DSCAL, DTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'DTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. 
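+*     Each SWORK( K, L ) records the local scale of block C( K, L ): the
+*     stored block holds (approximately) SWORK( K, L ) times the exact
+*     solution block, so the consistency scaling at the end can bring
+*     every block to the single common factor SCALE.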
+* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. 
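+*                 The power-of-two part of SCALOC is moved into BUF and
+*                 into the SWORK table below; only its fraction in
+*                 [0.5,1) is applied to the data. E.g. for
+*                 SCALOC = 2**(-512), EXPONENT( SCALOC ) = -511, so BUF
+*                 shrinks by 2**(-511) while the SWORK entries grow by
+*                 the same power of two (capped at BIGNUM).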
+ BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + SWORK( K, L ) = SCALOC * SWORK( K, L ) + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. 
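+*     SCAL now approximates the magnitude of the largest entry of C, so
+*     SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) is (roughly) the largest
+*     factor that undoes the buffer scaling without overflowing C;
+*     DLASCL applies it to C, and BUF moves correspondingly closer to
+*     one before it is combined with SCALE below.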
+* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of DTRSYL3 +* + END diff --git a/lapack-netlib/SRC/ieeeck.f b/lapack-netlib/SRC/ieeeck.f index 74065c3b4..f9f6332ec 100644 --- a/lapack-netlib/SRC/ieeeck.f +++ b/lapack-netlib/SRC/ieeeck.f @@ -41,7 +41,7 @@ *> \param[in] ISPEC *> \verbatim *> ISPEC is INTEGER -*> Specifies whether to test just for inifinity arithmetic +*> Specifies whether to test just for infinity arithmetic *> or whether to test for infinity and NaN arithmetic. *> = 0: Verify infinity arithmetic only. *> = 1: Verify infinity and NaN arithmetic. diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index af2850398..a639e0375 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -469,6 +469,15 @@ ELSE NB = 64 END IF + ELSE IF( C3.EQ.'SYL' ) THEN +* The upper bound is to prevent overly aggressive scaling. + IF( SNAME ) THEN + NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ), + $ 240 ) + ELSE + NB = MIN( MAX( 24, INT( ( MIN( N1, N2 ) * 8 ) / 100) ), + $ 80 ) + END IF END IF ELSE IF( C2.EQ.'LA' ) THEN IF( C3.EQ.'UUM' ) THEN @@ -477,6 +486,12 @@ ELSE NB = 64 END IF + ELSE IF( C3.EQ.'TRS' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN IF( C3.EQ.'EBZ' ) THEN diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F index c153eef22..c701c2be0 100644 --- a/lapack-netlib/SRC/iparam2stage.F +++ b/lapack-netlib/SRC/iparam2stage.F @@ -178,7 +178,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + LOGICAL LSAME + EXTERNAL ILAENV, LSAME * .. * .. Executable Statements .. 
* @@ -310,7 +311,7 @@ * * Will add the VECT OPTION HERE next release VECT = OPTS(1:1) - IF( VECT.EQ.'N' ) THEN + IF( LSAME( VECT, 'N' ) ) THEN LHOUS = MAX( 1, 4*NI ) ELSE * This is not correct, it need to call the ALGO and the stage2 diff --git a/lapack-netlib/SRC/sgebak.f b/lapack-netlib/SRC/sgebak.f index b51b611a9..abb7809a3 100644 --- a/lapack-netlib/SRC/sgebak.f +++ b/lapack-netlib/SRC/sgebak.f @@ -236,7 +236,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -250,7 +250,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/sgees.f b/lapack-netlib/SRC/sgees.f index d40503f89..6febd549c 100644 --- a/lapack-netlib/SRC/sgees.f +++ b/lapack-netlib/SRC/sgees.f @@ -302,7 +302,7 @@ * CALL SHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f index 27c4338d4..6810fe7c8 100644 --- a/lapack-netlib/SRC/sgeesx.f +++ b/lapack-netlib/SRC/sgeesx.f @@ -382,7 +382,7 @@ * CALL SHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index be9e2ea11..9aed4329f 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -253,11 +253,11 @@ * * Compute space needed for SGEQRF CALL SGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_SGEQRF=DUM(1) + LWORK_SGEQRF = INT( DUM(1) ) * Compute space needed for SORMQR CALL SORMQR( 'L', 'T', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_SORMQR=DUM(1) + LWORK_SORMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + LWORK_SGEQRF ) MAXWRK = MAX( MAXWRK, N + LWORK_SORMQR ) @@ -272,15 +272,15 @@ * Compute space needed for SGEBRD CALL SGEBRD( MM, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 3*N + LWORK_SGEBRD ) MAXWRK = MAX( MAXWRK, 3*N + LWORK_SORMBR ) @@ -304,19 +304,19 @@ * Compute space needed for SGEBRD CALL SGEBRD( M, M, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) * Compute space needed for SORMLQ CALL SORMLQ( 'L', 'T', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_SORMLQ=DUM(1) + LWORK_SORMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + M*ILAENV( 1, 'SGELQF', ' ', M, N, -1, $ -1 ) @@ -337,15 +337,15 @@ * Compute space needed for SGEBRD CALL 
SGEBRD( M, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) MAXWRK = 3*M + LWORK_SGEBRD MAXWRK = MAX( MAXWRK, 3*M + LWORK_SORMBR ) MAXWRK = MAX( MAXWRK, 3*M + LWORK_SORGBR ) diff --git a/lapack-netlib/SRC/sgelst.c b/lapack-netlib/SRC/sgelst.c new file mode 100644 index 000000000..e0cd84cd9 --- /dev/null +++ b/lapack-netlib/SRC/sgelst.c @@ -0,0 +1,1099 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint 
informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief SGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download SGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* REAL A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its transpose, using a QR or LQ */ +/* > factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'T' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'T' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'T': the linear system involves A**T. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by SGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by SGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'T'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. 
*/ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup realGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int sgelst_(char *trans, integer *m, integer *n, integer * + nrhs, real *a, integer *lda, real *b, integer *ldb, real *work, + integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; + + /* Local variables */ + real anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + real rwork[1]; + integer lwopt, nb; + extern /* Subroutine */ int slabad_(real *, real *); + integer mn; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + real bignum; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *, + real *, integer *), sgelqt_(integer *, integer *, integer + *, real *, integer *, real *, integer *, real *, integer *); + integer mnnrhs; + extern /* Subroutine */ int sgeqrt_(integer *, integer *, integer *, real + *, integer *, real *, integer *, real *, integer *); + real smlnum; + logical lquery; + extern /* Subroutine */ int strtrs_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *), sgemlqt_(char *, char *, integer *, + integer *, integer *, integer *, real *, integer *, real *, + integer *, real *, integer *, real *, integer *), + sgemqrt_(char *, char *, integer *, integer *, integer *, integer + *, real *, integer *, real *, integer *, real *, integer *, real * + , integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! (lsame_(trans, "N") || lsame_(trans, "T"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! 
lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "SGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + work[1] = (real) lwopt; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + slaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (real) lwopt; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "SGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = slamch_("S") / slamch_("P"); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = slange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + slascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + slascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + slaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (real) lwopt; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = slange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + slascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + slascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + sgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + sgemqrt_("Left", "Transpose", m, nrhs, n, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + strtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + strtrs_("Upper", "Transpose", "Non-unit", n, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.f; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + sgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + sgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + strtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.f; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + sgemlqt_("Left", "Transpose", n, nrhs, m, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + sgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + strtrs_("Lower", "Transpose", "Non-unit", m, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + slascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + slascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + slascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + slascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + work[1] = (real) lwopt; + + return 0; + +/* End of SGELST */ + +} /* sgelst_ */ + diff --git a/lapack-netlib/SRC/sgelst.f b/lapack-netlib/SRC/sgelst.f new file mode 100644 index 000000000..5377bc720 --- /dev/null +++ b/lapack-netlib/SRC/sgelst.f @@ -0,0 +1,531 @@ +*> \brief SGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its transpose, using a QR or LQ +*> factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'T' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'T' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'T': the linear system involves A**T. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. 
+*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by SGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by SGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'T'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup realGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), WORK( * ) +* .. 
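A minimal Fortran sketch of the LWORK = -1 workspace-query convention documented above; it is only an illustration, and the program name, the problem sizes and the Cauchy-type test data are assumptions, not part of SGELST or of this patch.

*     Illustrative sketch only: sizes and data are assumptions.
      PROGRAM SGELSTX
      INTEGER            M, N, NRHS, LDA, LDB, LWMAX
      PARAMETER          ( M = 6, N = 4, NRHS = 1, LDA = M, LDB = M,
     $                     LWMAX = 1024 )
      INTEGER            I, J, INFO, LWORK
      REAL               A( LDA, N ), B( LDB, NRHS ), WORK( LWMAX )
      EXTERNAL           SGELST
*
*     Build a small full-rank least squares problem (Cauchy-type A).
*
      DO J = 1, N
         DO I = 1, M
            A( I, J ) = 1.0E0 / REAL( I + J - 1 )
         END DO
      END DO
      DO I = 1, M
         B( I, 1 ) = REAL( I )
      END DO
*
*     Step 1: workspace query; only WORK( 1 ) is set on exit.
*
      LWORK = -1
      CALL SGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, LWORK,
     $             INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
*
*     Step 2: solve min || B - A*X ||; rows 1 to N of B return X,
*     rows N+1 to M carry the residual information described above.
*
      CALL SGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, LWORK,
     $             INFO )
      IF( INFO.EQ.0 ) THEN
         WRITE( *, * ) 'X = ', ( B( I, 1 ), I = 1, N )
      ELSE
         WRITE( *, * ) 'SGELST failed, INFO = ', INFO
      END IF
      END

The query call only writes the optimal size into WORK(1), so the same WORK array can be reused for the actual solve as long as its length is at least MN + MAX( MN, NRHS ).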
+* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + REAL ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + REAL RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SGELQT, SGEQRT, SGEMLQT, SGEMQRT, SLABAD, + $ SLASCL, SLASET, STRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'SGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = REAL( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'SGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL SLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = SLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = SLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. 
BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL SLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL SLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL SGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMQRT( 'Left', 'Transpose', M, NRHS, N, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, WORK( MN*NB+1 ), + $ INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL STRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL STRTRS( 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL SGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL STRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMLQT( 'Left', 'Transpose', N, NRHS, M, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL SGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL STRTRS( 'Lower', 'Transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL SLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL SLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = REAL( LWOPT ) +* + RETURN +* +* End of SGELST +* + END diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f index f47d8bf32..b24615f7a 100644 --- a/lapack-netlib/SRC/sgeqrf.f +++ b/lapack-netlib/SRC/sgeqrf.f @@ -204,7 +204,7 @@ END IF * * Quick return if possible -* +* IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/sggbak.f b/lapack-netlib/SRC/sggbak.f index bb7f36011..8a796fdb1 100644 --- a/lapack-netlib/SRC/sggbak.f +++ b/lapack-netlib/SRC/sggbak.f @@ -252,7 +252,7 @@ $ GO TO 50 * DO 40 I = ILO - 1, 1, -1 - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -262,7 +262,7 @@ IF( IHI.EQ.N ) $ GO TO 70 DO 60 I = IHI + 1, N - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 60 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -276,7 +276,7 @@ IF( ILO.EQ.1 ) $ GO TO 90 DO 80 I = ILO - 1, 1, -1 - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 80 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -286,7 +286,7 @@ IF( IHI.EQ.N ) $ GO TO 110 DO 100 I = IHI + 1, N - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 100 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/sggbal.f b/lapack-netlib/SRC/sggbal.f index 6cfdbcdba..d7a8ef16c 100644 --- a/lapack-netlib/SRC/sggbal.f +++ b/lapack-netlib/SRC/sggbal.f @@ -522,7 +522,7 @@ IRAB = ISAMAX( N-ILO+1, B( I, ILO ), LDB ) RAB = MAX( RAB, ABS( B( I, IRAB+ILO-1 ) ) ) LRAB = INT( LOG10( RAB+SFMIN ) / BASL+ONE ) - IR = LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) + IR = INT( LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) ) IR = MIN( MAX( IR, LSFMIN ), LSFMAX, LSFMAX-LRAB ) LSCALE( I ) = SCLFAC**IR ICAB = ISAMAX( IHI, A( 1, I ), 1 ) @@ -530,7 +530,7 @@ ICAB = ISAMAX( IHI, B( 1, I ), 1 ) CAB = MAX( CAB, ABS( B( ICAB, I ) ) ) LCAB = INT( LOG10( CAB+SFMIN ) / BASL+ONE ) - JC = RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) + JC = INT( RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) ) JC = MIN( MAX( JC, LSFMIN ), LSFMAX, LSFMAX-LCAB ) RSCALE( I ) = SCLFAC**JC 360 CONTINUE diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index bbd032beb..56b4dba52 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -288,7 +288,7 @@ * CALL SGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = WORK( M+NP+1 ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**T*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/sgglse.f b/lapack-netlib/SRC/sgglse.f index 7ef8782b0..59addc3f4 100644 --- a/lapack-netlib/SRC/sgglse.f +++ b/lapack-netlib/SRC/sgglse.f @@ -276,7 +276,7 @@ * CALL SGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 
), LWORK-P-MN, INFO ) - LOPT = WORK( P+MN+1 ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**T *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index c57b16a56..59b498da5 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL SGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**T*B. * diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index c4a78c347..8b7d4786a 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL SGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**T * diff --git a/lapack-netlib/SRC/shgeqz.f b/lapack-netlib/SRC/shgeqz.f index 79a9c6092..10fb2b7d7 100644 --- a/lapack-netlib/SRC/shgeqz.f +++ b/lapack-netlib/SRC/shgeqz.f @@ -536,9 +536,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = ZERO GO TO 70 END IF @@ -564,10 +562,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = ZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/slaed0.c b/lapack-netlib/SRC/slaed0.c index 33f7134c1..4c5230907 100644 --- a/lapack-netlib/SRC/slaed0.c +++ b/lapack-netlib/SRC/slaed0.c @@ -823,10 +823,10 @@ L10: temp = log((real) (*n)) / log(2.f); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/slaed4.f b/lapack-netlib/SRC/slaed4.f index f056746d8..339c5029c 100644 --- a/lapack-netlib/SRC/slaed4.f +++ b/lapack-netlib/SRC/slaed4.f @@ -328,9 +328,12 @@ IF( C.LT.ZERO ) $ C = ABS( C ) IF( C.EQ.ZERO ) THEN -* ETA = B/A +* ETA = B/A * ETA = RHO - TAU - ETA = DLTUB - TAU +* ETA = DLTUB - TAU +* +* Update proposed by Li, Ren-Cang: + ETA = -W / ( DPSI+DPHI ) ELSE IF( A.GE.ZERO ) THEN ETA = ( A+SQRT( ABS( A*A-FOUR*B*C ) ) ) / ( TWO*C ) ELSE diff --git a/lapack-netlib/SRC/slaed7.c b/lapack-netlib/SRC/slaed7.c index 210d796d1..22fcaf76d 100644 --- a/lapack-netlib/SRC/slaed7.c +++ b/lapack-netlib/SRC/slaed7.c @@ -883,11 +883,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/slaeda.c b/lapack-netlib/SRC/slaeda.c index 7edaf8a76..3806427c2 100644 --- a/lapack-netlib/SRC/slaeda.c +++ b/lapack-netlib/SRC/slaeda.c @@ -753,7 +753,7 @@ f"> */ /* scheme */ i__1 = *curlvl - 1; - curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1; + curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1; /* Determine size of these matrices. 
We add HALF to the value of */ /* the SQRT in case the machine underestimates one of these square */ @@ -779,12 +779,12 @@ f"> */ /* rotations and permutation and then multiplying the center matrices */ /* against the current Z. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (k = 1; k <= i__1; ++k) { i__2 = *curlvl - k; i__3 = *curlvl - k - 1; - curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) - + curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) - 1; psiz1 = prmptr[curr + 1] - prmptr[curr]; psiz2 = prmptr[curr + 2] - prmptr[curr + 1]; @@ -844,7 +844,7 @@ f"> */ c__1); i__2 = *tlvls - k; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L70: */ } diff --git a/lapack-netlib/SRC/slalsa.c b/lapack-netlib/SRC/slalsa.c index 53da2c7bf..77a79b80c 100644 --- a/lapack-netlib/SRC/slalsa.c +++ b/lapack-netlib/SRC/slalsa.c @@ -946,7 +946,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -960,7 +960,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1005,7 +1005,7 @@ L50: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index b9bae9376..a4f805674 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -286,8 +286,8 @@ * .. * .. Local Scalars .. REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ * ==== Perform update from right within * . computational window. 
==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -631,22 +635,25 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -706,12 +713,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +739,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +758,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f index 15913be88..69f402914 100644 --- a/lapack-netlib/SRC/slaqz0.f +++ b/lapack-netlib/SRC/slaqz0.f @@ -318,7 +318,8 @@ PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP + REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP, + $ BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -330,7 +331,7 @@ * External Functions EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD, $ SLARTG, SROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, SLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -482,6 +483,9 @@ ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = SLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -558,15 +562,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. 
BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/slarmm.c b/lapack-netlib/SRC/slarmm.c new file mode 100644 index 000000000..95114e2f1 --- /dev/null +++ b/lapack-netlib/SRC/slarmm.c @@ -0,0 +1,605 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef 
struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLARMM */ + +/* Definition: */ +/* =========== */ + +/* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) */ + +/* REAL ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is REAL */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is REAL */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +real slarmm_(real *anorm, real *bnorm, real *cnorm) +{ + /* System generated locals */ + real ret_val; + + /* Local variables */ + extern real slamch_(char *); + real bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = slamch_("Safe minimum") / slamch_("Precision"); + bignum = 1.f / smlnum / 4.f; + +/* Compute a scale factor. 
*/ + + ret_val = 1.f; + if (*bnorm <= 1.f) { + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5f; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5f / *bnorm; + } + } + return ret_val; + +/* ==== End of SLARMM ==== */ + +} /* slarmm_ */ + diff --git a/lapack-netlib/SRC/slarmm.f b/lapack-netlib/SRC/slarmm.f new file mode 100644 index 000000000..643dd6748 --- /dev/null +++ b/lapack-netlib/SRC/slarmm.f @@ -0,0 +1,99 @@ +*> \brief \b SLARMM +* +* Definition: +* =========== +* +* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* REAL ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> SLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is REAL +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is REAL +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is REAL +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + REAL ANORM, BNORM, CNORM +* .. Parameters .. + REAL ONE, HALF, FOUR + PARAMETER ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 ) +* .. +* .. Local Scalars .. + REAL BIGNUM, SMLNUM +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + SLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + SLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + SLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of SLARMM ==== +* + END diff --git a/lapack-netlib/SRC/slarscl2.f b/lapack-netlib/SRC/slarscl2.f index 5726f12cd..c7b77c908 100644 --- a/lapack-netlib/SRC/slarscl2.f +++ b/lapack-netlib/SRC/slarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b SLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b SLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> SLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> SLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is REAL array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. 
*> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slartg.f90 b/lapack-netlib/SRC/slartg.f90 index a9af1aa8d..8a5a8f26a 100644 --- a/lapack-netlib/SRC/slartg.f90 +++ b/lapack-netlib/SRC/slartg.f90 @@ -35,7 +35,7 @@ !> square root of the sum of squares. !> !> This version is discontinuous in R at F = 0 but it returns the same -!> C and S as SLARTG for complex inputs (F,0) and (G,0). +!> C and S as CLARTG for complex inputs (F,0) and (G,0). !> !> This is a more accurate version of the BLAS1 routine SROTG, !> with the following other differences: @@ -45,8 +45,6 @@ !> floating point operations (saves work in SBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. -!> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim ! @@ -112,7 +110,7 @@ subroutine SLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, half=>shalf, one=>sone, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine SLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! f1 = abs( f ) @@ -143,20 +145,18 @@ subroutine SLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r*u end if return end subroutine diff --git a/lapack-netlib/SRC/slascl.f b/lapack-netlib/SRC/slascl.f index e1cb420ea..28cbd6514 100644 --- a/lapack-netlib/SRC/slascl.f +++ b/lapack-netlib/SRC/slascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/slascl2.f b/lapack-netlib/SRC/slascl2.f index 07b506a8c..5efc1cfcd 100644 --- a/lapack-netlib/SRC/slascl2.f +++ b/lapack-netlib/SRC/slascl2.f @@ -1,4 +1,4 @@ -*> \brief \b SLASCL2 performs diagonal scaling on a vector. +*> \brief \b SLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> SLASCL2 performs a diagonal scaling on a vector: +*> SLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is REAL array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. 
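The reworked SLARTG above drops the precomputed reciprocals in favour of direct divisions by d and r and derives rtmin/rtmax locally from safmin/safmax. A hedged, self-contained C sketch of the same branch structure follows; FLT_MIN and FLT_MAX stand in for the LA_CONSTANTS safmin/safmax, the trivial g = 0 and f = 0 branches are filled in from the unchanged part of the routine (they are not in the hunk shown), and slartg_sketch is an illustrative name, not the library entry point.

#include <math.h>
#include <float.h>

/* Hedged sketch of the reworked SLARTG: compute c, s, r for the rotation
 * [ c s; -s c ] * [ f; g ] = [ r; 0 ], dividing by d and r directly. */
static void slartg_sketch(float f, float g, float *c, float *s, float *r)
{
    const float safmin = FLT_MIN, safmax = FLT_MAX;
    const float rtmin = sqrtf(safmin), rtmax = sqrtf(safmax / 2.0f);
    float f1 = fabsf(f), g1 = fabsf(g);

    if (g == 0.0f) {
        *c = 1.0f; *s = 0.0f; *r = f;
    } else if (f == 0.0f) {
        *c = 0.0f; *s = copysignf(1.0f, g); *r = g1;
    } else if (f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) {
        float d = sqrtf(f * f + g * g);
        *c = f1 / d;
        *r = copysignf(d, f);
        *s = g / *r;
    } else {
        /* Scale f and g into a representable range, then undo on r. */
        float u  = fminf(safmax, fmaxf(safmin, fmaxf(f1, g1)));
        float fs = f / u, gs = g / u;
        float d  = sqrtf(fs * fs + gs * gs);
        *c = fabsf(fs) / d;
        *r = copysignf(d, f);
        *s = gs / *r;
        *r = *r * u;
    }
}

The scaled branch forms the rotation in the safe range and restores the magnitude of r only at the end, which is exactly the order the Fortran hunk above uses.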
*> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slasd0.c b/lapack-netlib/SRC/slasd0.c index aa553579e..be1a74191 100644 --- a/lapack-netlib/SRC/slasd0.c +++ b/lapack-netlib/SRC/slasd0.c @@ -821,7 +821,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/slasda.c b/lapack-netlib/SRC/slasda.c index 71424c3f1..1d336d1ce 100644 --- a/lapack-netlib/SRC/slasda.c +++ b/lapack-netlib/SRC/slasda.c @@ -1023,7 +1023,7 @@ f"> */ /* Now conquer each subproblem bottom-up. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); for (lvl = nlvl; lvl >= 1; --lvl) { lvl2 = (lvl << 1) - 1; @@ -1035,7 +1035,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/slatbs.f b/lapack-netlib/SRC/slatbs.f index 617d0b2f5..77940f8cd 100644 --- a/lapack-netlib/SRC/slatbs.f +++ b/lapack-netlib/SRC/slatbs.f @@ -310,6 +310,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -317,7 +318,6 @@ * SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f index 94e0e88bc..0761d656f 100644 --- a/lapack-netlib/SRC/slatrs.f +++ b/lapack-netlib/SRC/slatrs.f @@ -264,8 +264,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX - REAL SASUM, SDOT, SLAMCH - EXTERNAL LSAME, ISAMAX, SASUM, SDOT, SLAMCH + REAL SASUM, SDOT, SLAMCH, SLANGE + EXTERNAL LSAME, ISAMAX, SASUM, SDOT, SLAMCH, SLANGE * .. * .. External Subroutines .. EXTERNAL SAXPY, SSCAL, STRSV, XERBLA @@ -304,6 +304,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -311,7 +312,6 @@ * SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -343,8 +343,67 @@ IF( TMAX.LE.BIGNUM ) THEN TSCAL = ONE ELSE - TSCAL = ONE / ( SMLNUM*TMAX ) - CALL SSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.SLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = ONE / ( SMLNUM*TMAX ) + CALL SSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be represented +* as floating-point number. Find the offdiagonal entry A( I, J ) +* with the largest absolute value. If this entry is not +/- Infinity, +* use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + $ TMAX ) + END DO + ELSE +* +* A is lower triangular. +* + DO J = 1, N - 1 + TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1, + $ SUMJ ), TMAX ) + END DO + END IF +* + IF( TMAX.LE.SLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.SLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm without introducing Infinity +* in the summation + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + END IF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point entry. +* Rely on TRSV to propagate Inf and NaN. 
+ CALL STRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/slatrs3.c b/lapack-netlib/SRC/slatrs3.c new file mode 100644 index 000000000..e5c48a55b --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.c @@ -0,0 +1,1262 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char 
*name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. 
*/ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is REAL array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
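Since SLATRS3 follows the usual LAPACK workspace-query convention (LWORK = -1 returns the required size in WORK(1)), a typical call sequence from C looks like the hedged sketch below. It assumes the integer/real typedefs and the slatrs3_ prototype shown in this file; the array setup and the helper name are illustrative only.

#include <stdlib.h>

/* Hedged usage sketch: query the workspace, allocate it, then solve
 * A * X = B * diag(scale) for an upper triangular, non-unit A.
 * "integer" and "real" are the f2c typedefs from this translation unit. */
extern int slatrs3_(char *uplo, char *trans, char *diag, char *normin,
                    integer *n, integer *nrhs, real *a, integer *lda,
                    real *x, integer *ldx, real *scale, real *cnorm,
                    real *work, integer *lwork, integer *info);

static void solve_upper_sketch(integer n, integer nrhs, real *a, integer lda,
                               real *x, integer ldx, real *scale, real *cnorm)
{
    integer info = 0, lwork = -1;
    real wkopt = 0.0f;
    real *work;

    /* Workspace query: on return WORK(1) holds the required size. */
    slatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, &wkopt, &lwork, &info);
    lwork = (integer) wkopt;
    work = (real *) malloc((size_t) lwork * sizeof(real));
    if (work == NULL)
        return;
    slatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, work, &lwork, &info);
    /* On exit, X(:,k) holds the k-th solution and SCALE(k) its scale factor. */
    free(work);
}

Passing NORMIN = 'N' makes the routine compute the column norms itself; a caller that already has CNORM filled in can pass 'Y' to reuse it.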
*/ + +/* ===================================================================== */ +/* Subroutine */ int slatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, real *a, integer *lda, real *x, + integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern logical lsame_(char *, char *); + real rscal; + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk, lscale; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + extern /* Subroutine */ int slatrs_(char *, char *, char *, char *, + integer *, real *, integer *, real *, real *, real *, integer *); + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "SLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! 
lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + slatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = slange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + sscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = slange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). 
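The two rescaling statements that follow implement this consistency step. As a standalone distillation (hypothetical names, contiguous segments instead of the routine's strided columns):

/* Bring two vector segments with local scale factors s_i and s_j onto the
 * common scale SCAMIN = min(s_i, s_j), damped by the SLARMM factor scaloc,
 * before the GEMM update. Hedged sketch, not the routine's own code. */
static void rescale_pair(float *xi, int ni, float *xj, int nj,
                         float *s_i, float *s_j, float scaloc)
{
    float scamin = (*s_i < *s_j) ? *s_i : *s_j;
    float scal_i = scamin / *s_i * scaloc;
    float scal_j = scamin / *s_j * scaloc;
    int k;

    if (scal_i != 1.0f) {
        for (k = 0; k < ni; ++k)
            xi[k] *= scal_i;
        *s_i = scamin * scaloc;
    }
    if (scal_j != 1.0f) {
        for (k = 0; k < nj; ++k)
            xj[k] *= scal_j;
        *s_j = scamin * scaloc;
    }
}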
*/ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + sscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + sscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of SLATRS3 */ + +} /* slatrs3_ */ + diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f new file mode 100644 index 000000000..c3a08e524 --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. 
+*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. 
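As a worked instance of this bound (illustrative numbers, not taken from the source): with N = 1000 and NB = 64, NBA = (1000 + 64 - 1)/64 = 16; with NRHS = 10, MIN(NRHS, 32) = 10 and MAX(NBA, 10) = 16, so LWORK must be at least 2 * 16 * 16 = 512. A tiny hypothetical C helper mirroring the documented formula:

/* Hypothetical helper computing the documented LWORK lower bound. */
static int slatrs3_lwork_bound(int n, int nrhs, int nb)
{
    int nba = (n + nb - 1) / nb;
    int k = (nrhs < 32) ? nrhs : 32;
    int m = (nba > k) ? nba : k;
    int bound = 2 * nba * m;
    return (bound > 1) ? bound : 1;   /* n=1000, nb=64, nrhs=10 -> 512 */
}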
+*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. 
+ LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL SLATRS, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( 8, ILAENV( 1, 'SLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). 
+* + IF( NOTRAN ) THEN + ANRM = SLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = SLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = SLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. 
ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL SSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = SLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL SGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL SGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of SLATRS3 +* + END diff --git a/lapack-netlib/SRC/sorbdb2.f b/lapack-netlib/SRC/sorbdb2.f index ad3eb269d..484d352f8 100644 --- a/lapack-netlib/SRC/sorbdb2.f +++ b/lapack-netlib/SRC/sorbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is REAL array, dimension (P) +*> TAUP1 is REAL array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is REAL array, dimension (M-P) +*> TAUP2 is REAL array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/sorbdb4.f b/lapack-netlib/SRC/sorbdb4.f index b18ed3b27..bf60fb7bb 100644 --- a/lapack-netlib/SRC/sorbdb4.f +++ b/lapack-netlib/SRC/sorbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is REAL array, dimension (P) +*> TAUP1 is REAL array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is REAL array, dimension (M-P) +*> TAUP2 is REAL array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f index a23b42beb..b2449e3be 100644 --- a/lapack-netlib/SRC/sorbdb6.f +++ b/lapack-netlib/SRC/sorbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,15 +173,18 @@ * ===================================================================== * * .. Parameters .. - REAL ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01E0, REALONE = 1.0E0, + REAL ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) REAL NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 ) * .. * .. Local Scalars .. - INTEGER I - REAL NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + REAL EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + REAL SLAMCH * .. * .. External Subroutines .. 
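The reorthogonalization scheme described above amounts to at most two classical Gram-Schmidt passes with an ALPHA = 0.01 acceptance test and an N*EPS cutoff for vectors that lie numerically in range(Q). The following minimal C sketch mirrors that control flow for a contiguous vector against a single column-major Q with orthonormal columns; it is not the LAPACK routine itself (SORBDB6 operates on the split vector (X1, X2) with arbitrary increments and uses SGEMV and SLASSQ), and the function and parameter names are illustrative only.

#include <math.h>

/* Sketch only: orthogonalize x (Euclidean norm one on entry) against the n
 * orthonormal columns of the m-by-n matrix Q (column-major, leading
 * dimension ldq), using at most two classical Gram-Schmidt passes. */
static void project_out(int m, int n, const float *Q, int ldq, float *x,
                        float *norm_new)
{
    /* x := x - Q * (Q**T * x), then return the Euclidean norm of x */
    for (int j = 0; j < n; ++j) {
        float w = 0.0f;
        for (int i = 0; i < m; ++i) w += Q[i + j * ldq] * x[i];
        for (int i = 0; i < m; ++i) x[i] -= Q[i + j * ldq] * w;
    }
    float s = 0.0f;
    for (int i = 0; i < m; ++i) s += x[i] * x[i];
    *norm_new = sqrtf(s);
}

static void cgs2(int m, int n, const float *Q, int ldq, float *x, float eps)
{
    const float alpha = 0.01f;   /* acceptance threshold, as above       */
    float norm = 1.0f;           /* the caller guarantees norm(x) = 1    */
    float norm_new;

    project_out(m, n, Q, ldq, x, &norm_new);
    if (norm_new >= alpha * norm)
        return;                                   /* first pass suffices */
    if (norm_new <= (float) n * eps * norm) {
        for (int i = 0; i < m; ++i) x[i] = 0.0f;  /* x is in range(Q)    */
        return;
    }
    norm = norm_new;
    project_out(m, n, Q, ldq, x, &norm_new);      /* "twice is enough"   */
    if (norm_new < alpha * norm)                  /* still shrinking, so */
        for (int i = 0; i < m; ++i) x[i] = 0.0f;  /* treat x as in range(Q) */
}

With eps set to SLAMCH('Precision'), the acceptance and zeroing decisions above are intended to mirror the logic introduced in the hunks below.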
EXTERNAL SGEMV, SLASSQ, XERBLA @@ -210,17 +219,17 @@ CALL XERBLA( 'SORBDB6', -INFO ) RETURN END IF +* + EPS = SLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -238,27 +247,31 @@ CALL SGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -280,24 +293,22 @@ CALL SGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. 
ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -306,4 +317,3 @@ * End of SORBDB6 * END - diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index 8f15523d4..b1a5c03a2 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -232,7 +232,7 @@ END IF END IF END IF - LWKOPT = WORK( 1 ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f index 9db8de08c..73862ed1b 100644 --- a/lapack-netlib/SRC/sspgvd.f +++ b/lapack-netlib/SRC/sspgvd.f @@ -307,8 +307,8 @@ CALL SSPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL SSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) - LWMIN = MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) - LIWMIN = MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) + LWMIN = INT( MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) ) + LIWMIN = INT( MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/sstedc.c b/lapack-netlib/SRC/sstedc.c index 46ed15a1a..61ad3dd37 100644 --- a/lapack-netlib/SRC/sstedc.c +++ b/lapack-netlib/SRC/sstedc.c @@ -804,10 +804,10 @@ f"> */ lwmin = *n - 1 << 1; } else { lgn = (integer) (log((real) (*n)) / log(2.f)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } if (icompz == 1) { diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index 8b90d9263..ac0d0284d 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -255,7 +255,7 @@ LWMIN = 2*N + 1 END IF LOPT = MAX( LWMIN, 2*N + - $ ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF WORK( 1 ) = LOPT diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f index 9002df237..7c7e0de01 100644 --- a/lapack-netlib/SRC/ssygvd.f +++ b/lapack-netlib/SRC/ssygvd.f @@ -330,8 +330,8 @@ CALL SSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL SSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LIWORK, $ INFO ) - LOPT = MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) - LIOPT = MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) + LOPT = INT( MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) ) + LIOPT = INT( MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. 
INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/ssysv.f b/lapack-netlib/SRC/ssysv.f index 5f4062e9a..06a42dfb7 100644 --- a/lapack-netlib/SRC/ssysv.f +++ b/lapack-netlib/SRC/ssysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL SSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssysv_rk.f b/lapack-netlib/SRC/ssysv_rk.f index 9e0487623..9a7dfa4bb 100644 --- a/lapack-netlib/SRC/ssysv_rk.f +++ b/lapack-netlib/SRC/ssysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL SSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssysv_rook.f b/lapack-netlib/SRC/ssysv_rook.f index b4da1101c..fb7ba8c53 100644 --- a/lapack-netlib/SRC/ssysv_rook.f +++ b/lapack-netlib/SRC/ssysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL SSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssyswapr.f b/lapack-netlib/SRC/ssyswapr.f index 5e4265d7a..e1ab5a22a 100644 --- a/lapack-netlib/SRC/ssyswapr.f +++ b/lapack-netlib/SRC/ssyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is REAL array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by SSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is REAL array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - REAL A( LDA, N ) + REAL A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I REAL TMP * * .. External Functions .. 
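For reference, the interchange specified by the rewritten argument description above touches, for UPLO = 'U' and I1 < I2, four segments of the upper triangle: the column entries above row I1, the two diagonal entries, the row-I1/column-I2 segment strictly between the two indices, and the trailing parts of rows I1 and I2; A(I1,I2) itself is invariant under the symmetric permutation. A dense, 0-based C sketch of that case follows (names are illustrative; the first segment is handled earlier in the routine, outside the hunks shown here).

/* Sketch: apply the symmetric permutation that swaps rows/columns i1 and
 * i2 (0-based, i1 < i2) to the upper triangle of the column-major n-by-n
 * matrix A with leading dimension lda.  Entries below the diagonal are
 * never referenced. */
static void syswapr_upper(float *A, int lda, int n, int i1, int i2)
{
    float tmp;
    /* swap A(0:i1-1, i1) with A(0:i1-1, i2) */
    for (int k = 0; k < i1; ++k) {
        tmp = A[k + i1 * lda];
        A[k + i1 * lda] = A[k + i2 * lda];
        A[k + i2 * lda] = tmp;
    }
    /* swap the diagonal entries A(i1,i1) and A(i2,i2) */
    tmp = A[i1 + i1 * lda];
    A[i1 + i1 * lda] = A[i2 + i2 * lda];
    A[i2 + i2 * lda] = tmp;
    /* swap A(i1, i1+1:i2-1) with A(i1+1:i2-1, i2); A(i1,i2) stays put */
    for (int k = i1 + 1; k < i2; ++k) {
        tmp = A[i1 + k * lda];
        A[i1 + k * lda] = A[k + i2 * lda];
        A[k + i2 * lda] = tmp;
    }
    /* swap A(i1, i2+1:n-1) with A(i2, i2+1:n-1) */
    for (int k = i2 + 1; k < n; ++k) {
        tmp = A[i1 + k * lda];
        A[i1 + k * lda] = A[i2 + k * lda];
        A[i2 + k * lda] = tmp;
    }
}

The middle and trailing loops are the ones the patch expresses with strided SSWAP calls (increments LDA and 1 for the middle segment, LDA and LDA for the trailing rows, the latter guarded by I2 < N).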
@@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL SSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL SSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL SSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL SSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE SSYSWAPR diff --git a/lapack-netlib/SRC/stprfb.f b/lapack-netlib/SRC/stprfb.f index 64e8b34f5..d91a80dfb 100644 --- a/lapack-netlib/SRC/stprfb.f +++ b/lapack-netlib/SRC/stprfb.f @@ -1,4 +1,4 @@ -*> \brief \b STPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b STPRFB applies a real "triangular-pentagonal" block reflector to a real matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * @@ -37,7 +37,7 @@ *> \verbatim *> *> STPRFB applies a real "triangular-pentagonal" block reflector H or its -*> conjugate transpose H^H to a real matrix C, which is composed of two +*> transpose H**T to a real matrix C, which is composed of two *> blocks A and B, either from the left or right. *> *> \endverbatim @@ -48,15 +48,15 @@ *> \param[in] SIDE *> \verbatim *> SIDE is CHARACTER*1 -*> = 'L': apply H or H^H from the Left -*> = 'R': apply H or H^H from the Right +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right *> \endverbatim *> *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 *> = 'N': apply H (No transpose) -*> = 'C': apply H^H (Conjugate transpose) +*> = 'T': apply H**T (Transpose) *> \endverbatim *> *> \param[in] DIRECT @@ -145,7 +145,7 @@ *> (LDA,N) if SIDE = 'L' or (LDA,K) if SIDE = 'R' *> On entry, the K-by-N or M-by-K matrix A. *> On exit, A is overwritten by the corresponding block of -*> H*C or H^H*C or C*H or C*H^H. See Further Details. +*> H*C or H**T*C or C*H or C*H**T. See Further Details. *> \endverbatim *> *> \param[in] LDA @@ -161,7 +161,7 @@ *> B is REAL array, dimension (LDB,N) *> On entry, the M-by-N matrix B. *> On exit, B is overwritten by the corresponding block of -*> H*C or H^H*C or C*H or C*H^H. See Further Details. +*> H*C or H**T*C or C*H or C*H**T. See Further Details. 
*> \endverbatim *> *> \param[in] LDB @@ -327,13 +327,13 @@ * Let W = [ I ] (K-by-K) * [ V ] (M-by-K) * -* Form H C or H^H C where C = [ A ] (K-by-N) -* [ B ] (M-by-N) +* Form H C or H**T C where C = [ A ] (K-by-N) +* [ B ] (M-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - T (A + V^H B) or A = A - T^H (A + V^H B) -* B = B - V T (A + V^H B) or B = B - V T^H (A + V^H B) +* A = A - T (A + V**T B) or A = A - T**T (A + V**T B) +* B = B - V T (A + V**T B) or B = B - V T**T (A + V**T B) * * --------------------------------------------------------------------------- * @@ -388,12 +388,12 @@ * Let W = [ I ] (K-by-K) * [ V ] (N-by-K) * -* Form C H or C H^H where C = [ A B ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ A B ] (A is M-by-K, B is M-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - (A + B V) T or A = A - (A + B V) T^H -* B = B - (A + B V) T V^H or B = B - (A + B V) T^H V^H +* A = A - (A + B V) T or A = A - (A + B V) T**T +* B = B - (A + B V) T V**T or B = B - (A + B V) T**T V**T * * --------------------------------------------------------------------------- * @@ -448,13 +448,13 @@ * Let W = [ V ] (M-by-K) * [ I ] (K-by-K) * -* Form H C or H^H C where C = [ B ] (M-by-N) -* [ A ] (K-by-N) +* Form H C or H**T C where C = [ B ] (M-by-N) +* [ A ] (K-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - T (A + V^H B) or A = A - T^H (A + V^H B) -* B = B - V T (A + V^H B) or B = B - V T^H (A + V^H B) +* A = A - T (A + V**T B) or A = A - T**T (A + V**T B) +* B = B - V T (A + V**T B) or B = B - V T**T (A + V**T B) * * --------------------------------------------------------------------------- * @@ -510,12 +510,12 @@ * Let W = [ V ] (N-by-K) * [ I ] (K-by-K) * -* Form C H or C H^H where C = [ B A ] (B is M-by-N, A is M-by-K) +* Form C H or C H**T where C = [ B A ] (B is M-by-N, A is M-by-K) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - (A + B V) T or A = A - (A + B V) T^H -* B = B - (A + B V) T V^H or B = B - (A + B V) T^H V^H +* A = A - (A + B V) T or A = A - (A + B V) T**T +* B = B - (A + B V) T V**T or B = B - (A + B V) T**T V**T * * --------------------------------------------------------------------------- * @@ -569,13 +569,13 @@ * * Let W = [ I V ] ( I is K-by-K, V is K-by-M ) * -* Form H C or H^H C where C = [ A ] (K-by-N) -* [ B ] (M-by-N) +* Form H C or H**T C where C = [ A ] (K-by-N) +* [ B ] (M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - T (A + V B) or A = A - T^H (A + V B) -* B = B - V^H T (A + V B) or B = B - V^H T^H (A + V B) +* A = A - T (A + V B) or A = A - T**T (A + V B) +* B = B - V**T T (A + V B) or B = B - V**T T**T (A + V B) * * --------------------------------------------------------------------------- * @@ -629,12 +629,12 @@ * * Let W = [ I V ] ( I is K-by-K, V is K-by-N ) * -* Form C H or C H^H where C = [ A B ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ A B ] (A is M-by-K, B is M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - (A + B V^H) T or A = A - (A + B V^H) T^H -* B = B - (A + B V^H) T V or B = B - (A + B V^H) T^H V +* A = A - (A + B V**T) T or A = A - (A + B V**T) T**T +* B = B - (A + B V**T) T V or B = B - (A + B V**T) T**T V * * 
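The paired updates in each case above follow directly from the factored form of H. For the first case (columnwise V, forward direction, applied from the left with no transpose), W**T C = A + V**T B, so H C = C - W T (W**T C) yields exactly A := A - T (A + V**T B) and B := B - V T (A + V**T B). Below is a naive, unblocked C sketch of that case under simplifying assumptions (dense V, an explicit K-by-N scratch array, illustrative names); the actual routine exploits the triangular block of the pentagonal V and performs these products with GEMM and TRMM.

/* Sketch of the columnwise/forward/left, no-transpose case:
 *   C = [ A ] (K-by-N)      W = [ I ]            H = I - W T W**T
 *       [ B ] (M-by-N)          [ V ] (M-by-K)
 * All matrices are column-major; work is a K-by-N scratch array. */
static void tprfb_lfc_n(int m, int n, int k,
                        const float *V, int ldv,  /* M-by-K            */
                        const float *T, int ldt,  /* K-by-K, upper     */
                        float *A, int lda,        /* K-by-N            */
                        float *B, int ldb,        /* M-by-N            */
                        float *work)
{
    /* work := A + V**T * B */
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < k; ++i) {
            float s = A[i + j * lda];
            for (int l = 0; l < m; ++l)
                s += V[l + i * ldv] * B[l + j * ldb];
            work[i + j * k] = s;
        }
    /* work := T * work  (T is upper triangular, so row i only needs rows
     * >= i and the product can be formed in place, top row first) */
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < k; ++i) {
            float s = 0.0f;
            for (int l = i; l < k; ++l)
                s += T[i + l * ldt] * work[l + j * k];
            work[i + j * k] = s;
        }
    /* A := A - work,  B := B - V * work */
    for (int j = 0; j < n; ++j) {
        for (int i = 0; i < k; ++i)
            A[i + j * lda] -= work[i + j * k];
        for (int i = 0; i < m; ++i) {
            float s = 0.0f;
            for (int l = 0; l < k; ++l)
                s += V[i + l * ldv] * work[l + j * k];
            B[i + j * ldb] -= s;
        }
    }
}

For TRANS = 'T' the same sequence applies with T replaced by T**T, as in the formulas quoted above.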
--------------------------------------------------------------------------- * @@ -688,13 +688,13 @@ * * Let W = [ V I ] ( I is K-by-K, V is K-by-M ) * -* Form H C or H^H C where C = [ B ] (M-by-N) -* [ A ] (K-by-N) +* Form H C or H**T C where C = [ B ] (M-by-N) +* [ A ] (K-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - T (A + V B) or A = A - T^H (A + V B) -* B = B - V^H T (A + V B) or B = B - V^H T^H (A + V B) +* A = A - T (A + V B) or A = A - T**T (A + V B) +* B = B - V**T T (A + V B) or B = B - V**T T**T (A + V B) * * --------------------------------------------------------------------------- * @@ -748,12 +748,12 @@ * * Let W = [ V I ] ( I is K-by-K, V is K-by-N ) * -* Form C H or C H^H where C = [ B A ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ B A ] (A is M-by-K, B is M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - (A + B V^H) T or A = A - (A + B V^H) T^H -* B = B - (A + B V^H) T V or B = B - (A + B V^H) T^H V +* A = A - (A + B V**T) T or A = A - (A + B V**T) T**T +* B = B - (A + B V**T) T V or B = B - (A + B V**T) T**T V * * --------------------------------------------------------------------------- * diff --git a/lapack-netlib/SRC/strsyl3.c b/lapack-netlib/SRC/strsyl3.c new file mode 100644 index 000000000..8ce30ed56 --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.c @@ -0,0 +1,2066 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag 
oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b STRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > STRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by SHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is REAL array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. 
Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int strsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, real *a, integer *lda, real *b, integer *ldb, + real *c__, integer *ldc, real *scale, integer *iwork, integer *liwork, + real *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3; + + /* Local variables */ + real scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + real *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(real *); + integer l2, nb, pc, jj, ll; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *); + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + logical lquery; + extern /* Subroutine */ int strsyl_(char *, char *, integer *, integer *, + integer *, real *, integer *, real *, integer *, real *, integer * + , real *, integer *); + integer nba, nbb; + real buf, sgn; + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "STRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! 
+ lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } else if (! lquery && *liwork < iwork[1]) { + *info = -14; + } else if (! lquery && *ldswork < f2cmax(nba,nbb)) { + *info = -16; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("STRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return 0; + } + +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + strsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + + +/* REAL WNRM( MAX( M, N ) ) */ + wnrm=(real*)malloc (f2cmax(*m,*n)*sizeof(real)); + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0.f && a[l + 1 + l * a_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N ) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0.f && b[l + 1 + l * b_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. 
*/ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = slange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = slange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + sgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = slange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__4, &i__5, &i__6, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = slange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + sscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + sgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = slange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = slange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__2, &i__3, &i__4, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + r__2 = scal, r__3 = (r__1 = c__[k + l * c_dim1], abs(r__1)); + scal = f2cmax(r__2,r__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. 
*/ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + slascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return 0; + +/* End of STRSYL3 */ + +} /* strsyl3_ */ + diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f new file mode 100644 index 000000000..28762c2ed --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.f @@ -0,0 +1,1244 @@ +*> \brief \b STRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> STRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by SHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. 
LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE STRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + REAL SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLANGE, SLAMCH, SLARMM + EXTERNAL SLANGE, SLAMCH, SLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLASCL, SSCAL, STRSYL, XERBLA +* .. +* .. Intrinsic Functions .. 
+ INTRINSIC ABS, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'STRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL STRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. 
+* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. 
+ BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. 
+* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of STRSYL3 +* + END diff --git a/lapack-netlib/SRC/zgebak.f b/lapack-netlib/SRC/zgebak.f index 9ec610efb..9a0f65a43 100644 --- a/lapack-netlib/SRC/zgebak.f +++ b/lapack-netlib/SRC/zgebak.f @@ -238,7 +238,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL ZSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -252,7 +252,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL ZSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/zgees.f b/lapack-netlib/SRC/zgees.f index 40fe78d34..d673087bf 100644 --- a/lapack-netlib/SRC/zgees.f +++ b/lapack-netlib/SRC/zgees.f @@ -282,7 +282,7 @@ * CALL ZHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = DBLE( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/zgeesx.f b/lapack-netlib/SRC/zgeesx.f index ca4f5c913..bdd741b11 100644 --- a/lapack-netlib/SRC/zgeesx.f +++ b/lapack-netlib/SRC/zgeesx.f @@ -337,7 +337,7 @@ * CALL ZHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = DBLE( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index 0c2226f9f..d1106696c 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -707,11 +707,11 @@ IF ( LQUERY ) THEN CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_ZGEQP3 = DBLE( CDUMMY(1) ) + LWRK_ZGEQP3 = INT( CDUMMY(1) ) CALL ZGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_ZGEQRF = DBLE( CDUMMY(1) ) + LWRK_ZGEQRF = INT( CDUMMY(1) ) CALL ZGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_ZGELQF = DBLE( CDUMMY(1) ) + LWRK_ZGELQF = INT( CDUMMY(1) ) END IF MINWRK = 2 OPTWRK = 2 @@ -727,7 +727,7 @@ IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N**2+LWCON, $ N+LWRK_ZGEQRF, LWRK_ZGESVJ ) @@ -763,10 +763,10 @@ IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMLQ = DBLE( CDUMMY(1) ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, LWCON, LWRK_ZGESVJ, $ N+LWRK_ZGELQF, 2*N+LWRK_ZGEQRF, @@ -802,10 +802,10 @@ IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, N+LWRK_ZGEQRF, $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) @@ -864,26 +864,26 @@ IF ( LQUERY ) THEN CALL ZUNMQR( 'L', 
'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQR = DBLE( CDUMMY(1) ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) IF ( .NOT. JRACC ) THEN CALL ZGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_ZGEQP3N = DBLE( CDUMMY(1) ) + LWRK_ZGEQP3N = INT( CDUMMY(1) ) CALL ZGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJU = DBLE( CDUMMY(1) ) + LWRK_ZGESVJU = INT( CDUMMY(1) ) CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJV = DBLE( CDUMMY(1) ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMLQ = DBLE( CDUMMY(1) ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, @@ -912,13 +912,13 @@ ELSE CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJV = DBLE( CDUMMY(1) ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMQR = DBLE( CDUMMY(1) ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, $ 2*N+LWRK_ZGEQRF, 2*N+N**2, diff --git a/lapack-netlib/SRC/zgelss.f b/lapack-netlib/SRC/zgelss.f index e4aba6497..be53ba95b 100644 --- a/lapack-netlib/SRC/zgelss.f +++ b/lapack-netlib/SRC/zgelss.f @@ -266,11 +266,11 @@ * * Compute space needed for ZGEQRF CALL ZGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_ZGEQRF = DBLE( DUM(1) ) + LWORK_ZGEQRF = INT( DUM(1) ) * Compute space needed for ZUNMQR CALL ZUNMQR( 'L', 'C', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_ZUNMQR = DBLE( DUM(1) ) + LWORK_ZUNMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + N*ILAENV( 1, 'ZGEQRF', ' ', M, $ N, -1, -1 ) ) @@ -284,15 +284,15 @@ * Compute space needed for ZGEBRD CALL ZGEBRD( MM, N, A, LDA, S, S, DUM(1), DUM(1), DUM(1), $ -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 2*N + LWORK_ZGEBRD ) MAXWRK = MAX( MAXWRK, 2*N + LWORK_ZUNMBR ) @@ -310,23 +310,23 @@ * Compute space needed for ZGELQF CALL ZGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_ZGELQF = DBLE( DUM(1) ) + LWORK_ZGELQF = INT( DUM(1) ) * Compute space needed for ZGEBRD CALL ZGEBRD( M, M, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', M, M, M, 
A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) * Compute space needed for ZUNMLQ CALL ZUNMLQ( 'L', 'C', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMLQ = DBLE( DUM(1) ) + LWORK_ZUNMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_ZGELQF MAXWRK = MAX( MAXWRK, 3*M + M*M + LWORK_ZGEBRD ) @@ -345,15 +345,15 @@ * Compute space needed for ZGEBRD CALL ZGEBRD( M, N, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) MAXWRK = 2*M + LWORK_ZGEBRD MAXWRK = MAX( MAXWRK, 2*M + LWORK_ZUNMBR ) MAXWRK = MAX( MAXWRK, 2*M + LWORK_ZUNGBR ) diff --git a/lapack-netlib/SRC/zgelst.c b/lapack-netlib/SRC/zgelst.c new file mode 100644 index 000000000..447cd30bb --- /dev/null +++ b/lapack-netlib/SRC/zgelst.c @@ -0,0 +1,1115 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, 
endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief ZGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download ZGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its conjugate-transpose, using a QR */ +/* > or LQ factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'C' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'C' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'C': the linear system involves A**H. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by ZGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by ZGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'C'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > modulus of elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of the modulus of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16GEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int zgelst_(char *trans, integer *m, integer *n, integer * + nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, + doublecomplex *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; + doublereal d__1; + + /* Local variables */ + doublereal anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + doublereal rwork[1]; + integer lwopt; + extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + integer nb; + extern doublereal dlamch_(char *); + integer mn; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + doublereal bignum; + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublecomplex *, + integer *, integer *), zlaset_(char *, integer *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, + integer *); + integer mnnrhs; + extern /* Subroutine */ int zgelqt_(integer *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *); + doublereal smlnum; + extern /* Subroutine */ int zgeqrt_(integer *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *); + logical lquery; + extern /* Subroutine */ int ztrtrs_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + integer *), zgemlqt_(char *, char *, + integer *, integer *, integer *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *), zgemqrt_(char *, + char *, integer *, integer *, integer *, integer *, doublecomplex + *, integer *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. 
*/ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! (lsame_(trans, "N") || lsame_(trans, "C"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "ZGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "ZGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = dlamch_("S") / dlamch_("P"); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = zlange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0. && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + zlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + zlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = zlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0. 
&& bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + zlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + zlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + zgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemqrt_("Left", "Conjugate transpose", m, nrhs, n, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + ztrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + ztrtrs_("Upper", "Conjugate transpose", "Non-unit", n, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0., b[i__3].i = 0.; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + zgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + ztrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0., b[i__3].i = 0.; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + zgemlqt_("Left", "Conjugate transpose", n, nrhs, m, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + ztrtrs_("Lower", "Conjugate transpose", "Non-unit", m, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + zlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + zlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + zlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + zlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + + return 0; + +/* End of ZGELST */ + +} /* zgelst_ */ + diff --git a/lapack-netlib/SRC/zgelst.f b/lapack-netlib/SRC/zgelst.f new file mode 100644 index 000000000..4dabdc91e --- /dev/null +++ b/lapack-netlib/SRC/zgelst.f @@ -0,0 +1,533 @@ +*> \brief ZGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its conjugate-transpose, using a QR +*> or LQ factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'C' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'C' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. 
+*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'C': the linear system involves A**H. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by ZGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by ZGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'C'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> modulus of elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of the modulus of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
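Alongside the interface documented above, a hedged usage sketch (not part of the patch itself): a workspace query with LWORK = -1 followed by the actual least-squares solve, using the argument list described in the header. The program name TZGELST, the problem sizes, and the matrix data are invented for illustration, and a LAPACK/OpenBLAS build that contains ZGELST is assumed to be linked.

      PROGRAM TZGELST
*     Hedged sketch: least-squares solve min || B - A*X || with the
*     ZGELST driver (TRANS = 'N', M >= N, QR path).  Sizes and data
*     are made up; assumes a patched LAPACK/OpenBLAS is linked in.
      INTEGER            M, N, NRHS, LDA, LDB, LWMAX
      PARAMETER          ( M = 4, N = 2, NRHS = 1, LDA = M, LDB = M,
     $                     LWMAX = 64 )
      INTEGER            I, J, INFO, LWOPT
      COMPLEX*16         A( LDA, N ), B( LDB, NRHS ), WORK( LWMAX )
*     Build a simple full-rank test matrix and right-hand side.
      DO J = 1, N
         DO I = 1, M
            A( I, J ) = DCMPLX( DBLE( I + J ), DBLE( I - J ) )
         END DO
      END DO
      DO I = 1, M
         B( I, 1 ) = DCMPLX( DBLE( I ), 0.0D+0 )
      END DO
*     Workspace query: ZGELST reports the optimal LWORK in WORK( 1 ).
      CALL ZGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, -1, INFO )
      LWOPT = MIN( LWMAX, INT( DBLE( WORK( 1 ) ) ) )
*     Actual solve: on exit rows 1..N of B hold the solution X.
      CALL ZGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, LWOPT,
     $             INFO )
      WRITE( *, * ) 'INFO =', INFO, '  X =', ( B( I, 1 ), I = 1, N )
      END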
+* +*> \ingroup complex16GEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL LSAME, ILAENV, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZGELQT, ZGEQRT, ZGEMLQT, ZGEMQRT, DLABAD, + $ ZLASCL, ZLASET, ZTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'C' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'ZGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = DBLE( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL ZLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'ZGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL DLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = ZLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL ZLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL ZLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL ZLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = ZLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL ZLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL ZLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL ZGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMQRT( 'Left', 'Conjugate transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL ZTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL ZTRTRS( 'Upper', 'Conjugate transpose', 'Non-unit', + $ N, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL ZGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL ZTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL ZGEMLQT( 'Left', 'Conjugate transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL ZTRTRS( 'Lower', 'Conjugate transpose', 'Non-unit', + $ M, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL ZLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL ZLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL ZLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL ZLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = DBLE( LWOPT ) +* + RETURN +* +* End of ZGELST +* + END diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index 6c24131aa..62b4acdec 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -289,7 +289,7 @@ * CALL ZGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = DBLE( WORK( M+NP+1 ) ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**H*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/zgglse.f b/lapack-netlib/SRC/zgglse.f index e5869a7d4..cc558bc40 100644 --- a/lapack-netlib/SRC/zgglse.f +++ b/lapack-netlib/SRC/zgglse.f @@ -276,7 +276,7 @@ * CALL ZGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = DBLE( WORK( P+MN+1 ) ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**H *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 93b1dc0fc..0388b0874 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL ZGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = DBLE( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**H*B. 
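The LOPT assignments in the surrounding hunks all read back the optimal workspace size that the underlying factorization reports in WORK( 1 ); because WORK is COMPLEX*16 while LOPT is INTEGER, the value has to pass through INT( ) before it can drive integer workspace arithmetic. A hedged caller-side sketch of the same idiom follows; the subroutine name QRYQRF and its arguments are illustrative only and are not part of the patch.

*     Hedged sketch of the workspace-size idiom: ask ZGEQRF for its
*     optimal LWORK (LWORK = -1 requests a query; A and TAU are not
*     referenced) and convert the reported size to an INTEGER.
      SUBROUTINE QRYQRF( M, N, A, LDA, LOPT )
      INTEGER            M, N, LDA, LOPT, INFO
      COMPLEX*16         A( LDA, * ), TAU( 1 ), WQRY( 1 )
      CALL ZGEQRF( M, N, A, LDA, TAU, WQRY, -1, INFO )
      LOPT = INT( DBLE( WQRY( 1 ) ) )
      RETURN
      END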
* diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index a2d4a9d55..be912c772 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL ZGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = DBLE( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**H * diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index a6484eb03..7f58c7f72 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -284,7 +284,7 @@ LIWMIN = 1 END IF LOPT = MAX( LWMIN, N + - $ ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) ) LROPT = LRWMIN LIOPT = LIWMIN END IF diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f index 2e92255df..eeda656ad 100644 --- a/lapack-netlib/SRC/zhegvd.f +++ b/lapack-netlib/SRC/zhegvd.f @@ -360,9 +360,9 @@ CALL ZHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL ZHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, INFO ) - LOPT = MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) - LROPT = MAX( DBLE( LROPT ), DBLE( RWORK( 1 ) ) ) - LIOPT = MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) + LOPT = INT( MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) ) + LROPT = INT( MAX( DBLE( LROPT ), DBLE( RWORK( 1 ) ) ) ) + LIOPT = INT( MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/zhesv_rk.f b/lapack-netlib/SRC/zhesv_rk.f index 1ec75cc04..6333e9f36 100644 --- a/lapack-netlib/SRC/zhesv_rk.f +++ b/lapack-netlib/SRC/zhesv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL ZHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index 302b69f34..c15e7aace 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -524,9 +524,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = CZERO GO TO 50 END IF @@ -552,10 +550,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. 
ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = CZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f index d27cdc761..e96e39738 100644 --- a/lapack-netlib/SRC/zhpgvd.f +++ b/lapack-netlib/SRC/zhpgvd.f @@ -335,9 +335,9 @@ CALL ZHPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL ZHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, INFO ) - LWMIN = MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) - LRWMIN = MAX( DBLE( LRWMIN ), DBLE( RWORK( 1 ) ) ) - LIWMIN = MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) + LWMIN = INT( MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) ) + LRWMIN = INT( MAX( DBLE( LRWMIN ), DBLE( RWORK( 1 ) ) ) ) + LIWMIN = INT( MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/zlaed0.c b/lapack-netlib/SRC/zlaed0.c index 37bd12b01..2b25f6e4e 100644 --- a/lapack-netlib/SRC/zlaed0.c +++ b/lapack-netlib/SRC/zlaed0.c @@ -793,10 +793,10 @@ L10: temp = log((doublereal) (*n)) / log(2.); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/zlaed7.c b/lapack-netlib/SRC/zlaed7.c index 093051917..8665ee12c 100644 --- a/lapack-netlib/SRC/zlaed7.c +++ b/lapack-netlib/SRC/zlaed7.c @@ -864,11 +864,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/zlag2c.f b/lapack-netlib/SRC/zlag2c.f index ba141a98f..434590bb9 100644 --- a/lapack-netlib/SRC/zlag2c.f +++ b/lapack-netlib/SRC/zlag2c.f @@ -124,7 +124,7 @@ DOUBLE PRECISION RMAX * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, DIMAG + INTRINSIC DBLE, DIMAG, CMPLX * .. * .. External Functions .. REAL SLAMCH @@ -142,7 +142,7 @@ INFO = 1 GO TO 30 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 10 CONTINUE 20 CONTINUE INFO = 0 diff --git a/lapack-netlib/SRC/zlaic1.f b/lapack-netlib/SRC/zlaic1.f index 72948cde9..47927e778 100644 --- a/lapack-netlib/SRC/zlaic1.f +++ b/lapack-netlib/SRC/zlaic1.f @@ -348,9 +348,9 @@ B = ( ZETA2*ZETA2+ZETA1*ZETA1-ONE )*HALF C = ZETA1*ZETA1 IF( B.GE.ZERO ) THEN - T = -C / ( B+SQRT( B*B+C ) ) + T = DBLE( -C / ( B+SQRT( B*B+C ) ) ) ELSE - T = B - SQRT( B*B+C ) + T = DBLE( B - SQRT( B*B+C ) ) END IF SINE = -( ALPHA / ABSEST ) / T COSINE = -( GAMMA / ABSEST ) / ( ONE+T ) diff --git a/lapack-netlib/SRC/zlalsa.c b/lapack-netlib/SRC/zlalsa.c index d17016e7d..cd0819c3d 100644 --- a/lapack-netlib/SRC/zlalsa.c +++ b/lapack-netlib/SRC/zlalsa.c @@ -1051,7 +1051,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. 
*/ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -1065,7 +1065,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1110,7 +1110,7 @@ L170: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 3185508bc..4fa5ee5b0 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -279,7 +279,7 @@ PARAMETER ( RZERO = 0.0d0, RONE = 1.0d0 ) * .. * .. Local Scalars .. - COMPLEX*16 ALPHA, BETA, CDUM, REFSUM + COMPLEX*16 ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*DCONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ ELSE JBOT = KBOT END IF + T1 = DCONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ DCONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -610,25 +611,29 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + $ + DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + DCONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -688,13 +693,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + DCONJG( V( 2, M ) )*H( K+2, J ) + $ + DCONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +719,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +738,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f index 2616f20b5..0d8884ed5 100644 --- a/lapack-netlib/SRC/zlaqz0.f +++ b/lapack-netlib/SRC/zlaqz0.f @@ -300,7 +300,8 @@ PARAMETER( ZERO = 0.0D0, ONE = 1.0D0, HALF = 0.5D0 ) * Local scalars - DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, + $ BNORM, BTOL COMPLEX*16 :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -313,7 +314,7 @@ * External Functions EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD, $ ZLARTG, ZROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -467,6 +468,9 @@ ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = ZLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -529,15 +533,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. 
BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/zlarscl2.f b/lapack-netlib/SRC/zlarscl2.f index 4a1e1603a..e61865906 100644 --- a/lapack-netlib/SRC/zlarscl2.f +++ b/lapack-netlib/SRC/zlarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b ZLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b ZLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> ZLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> ZLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the DOUBLE PRECISION diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX*16 array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zlartg.f90 b/lapack-netlib/SRC/zlartg.f90 index 337a4dda8..a4f9bd4b0 100644 --- a/lapack-netlib/SRC/zlartg.f90 +++ b/lapack-netlib/SRC/zlartg.f90 @@ -11,8 +11,8 @@ ! SUBROUTINE ZLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C -! COMPLEX(wp) F, G, R, S +! REAL(wp) C +! COMPLEX(wp) F, G, R, S ! .. ! !> \par Purpose: @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,6 +38,10 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. +!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be !> identical to those returned by DLARTG. @@ -46,11 +50,8 @@ !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine ZROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine ZROTG fom BLAS1, except that +!> F and G are unchanged on return. !> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -117,7 +115,7 @@ subroutine ZLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, one=>done, two=>dtwo, czero=>zzero, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. 
of Tennessee, -- @@ -129,7 +127,7 @@ subroutine ZLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -141,6 +139,9 @@ subroutine ZLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. +! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -149,30 +150,43 @@ subroutine ZLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -181,32 +195,51 @@ subroutine ZLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,19 +247,43 @@ subroutine ZLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! 
w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return diff --git a/lapack-netlib/SRC/zlascl.f b/lapack-netlib/SRC/zlascl.f index 3d53f5ae6..4cce5ff5e 100644 --- a/lapack-netlib/SRC/zlascl.f +++ b/lapack-netlib/SRC/zlascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/zlascl2.f b/lapack-netlib/SRC/zlascl2.f index c4e6992fb..26406c363 100644 --- a/lapack-netlib/SRC/zlascl2.f +++ b/lapack-netlib/SRC/zlascl2.f @@ -1,4 +1,4 @@ -*> \brief \b ZLASCL2 performs diagonal scaling on a vector. +*> \brief \b ZLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> ZLASCL2 performs a diagonal scaling on a vector: +*> ZLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the DOUBLE PRECISION diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX*16 array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zlat2c.f b/lapack-netlib/SRC/zlat2c.f index 1d607dcea..a413b05c1 100644 --- a/lapack-netlib/SRC/zlat2c.f +++ b/lapack-netlib/SRC/zlat2c.f @@ -130,7 +130,7 @@ LOGICAL UPPER * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, DIMAG + INTRINSIC DBLE, DIMAG, CMPLX * .. * .. External Functions .. REAL SLAMCH @@ -151,7 +151,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE @@ -164,7 +164,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 30 CONTINUE 40 CONTINUE END IF diff --git a/lapack-netlib/SRC/zlatbs.f b/lapack-netlib/SRC/zlatbs.f index b7b2cb8ae..bdffa1ea9 100644 --- a/lapack-netlib/SRC/zlatbs.f +++ b/lapack-netlib/SRC/zlatbs.f @@ -278,7 +278,7 @@ $ ZDOTU, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTBSV, DLABAD + EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTBSV * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -324,17 +324,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = DLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / DLAMCH( 'Precision' ) + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/zlatrs.f b/lapack-netlib/SRC/zlatrs.f index 91bb688ec..2276ace87 100644 --- a/lapack-netlib/SRC/zlatrs.f +++ b/lapack-netlib/SRC/zlatrs.f @@ -274,7 +274,7 @@ $ ZDOTU, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTRSV, DLABAD + EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTRSV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -318,17 +318,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = DLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / DLAMCH( 'Precision' ) + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -360,8 +357,74 @@ IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE - TSCAL = HALF / ( SMLNUM*TMAX ) - CALL DSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.DLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = HALF / ( SMLNUM*TMAX ) + CALL DSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be +* represented as a floating-point number. Find the +* maximum offdiagonal absolute value +* max( |Re(A(I,J))|, |Im(A(I,J)| ). If this entry is +* not +/- Infinity, use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + DO I = 1, J - 1 + TMAX = MAX( TMAX, ABS( DBLE( A( I, J ) ) ), + $ ABS( DIMAG(A ( I, J ) ) ) ) + END DO + END DO + ELSE +* +* A is lower triangular. +* + DO J = 1, N - 1 + DO I = J + 1, N + TMAX = MAX( TMAX, ABS( DBLE( A( I, J ) ) ), + $ ABS( DIMAG(A ( I, J ) ) ) ) + END DO + END DO + END IF +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.DLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm of each column without +* introducing Infinity in the summation. + TSCAL = TWO * TSCAL + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + END IF + TSCAL = TSCAL * HALF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point +* entry. Rely on TRSV to propagate Inf and NaN. 
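The ZLATRS change above recomputes a column norm with a scale factor applied to every term so that the accumulation itself cannot overflow to Inf. The sketch below shows only that accumulation idea in C; cabs2() stands in for the CABS2 statement function of ZLATRS (half the 1-norm of an entry), and the column data and tscal value are made up for illustration, whereas the real code derives TSCAL from the largest off-diagonal magnitude.

/* Scale each term before adding it, so the running sum stays finite even
 * when individual entries are close to the overflow threshold.
 */
#include <complex.h>
#include <float.h>
#include <math.h>
#include <stdio.h>

static double cabs2(double complex z) {
    return fabs(creal(z)) / 2.0 + fabs(cimag(z)) / 2.0;
}

/* Scaled 1-norm of column x(0..n-1): sum_i tscal * cabs2(x[i]). */
static double scaled_colnorm(const double complex *x, int n, double tscal) {
    double s = 0.0;
    for (int i = 0; i < n; ++i)
        s += tscal * cabs2(x[i]);   /* scale first, then accumulate */
    return s;
}

int main(void) {
    /* Two huge entries whose plain sum of moduli overflows. */
    double complex col[2] = { 0.9 * DBL_MAX, 0.9 * DBL_MAX * I };
    double naive = cabs(col[0]) + cabs(col[1]);   /* becomes inf */
    double tscal = 0.25;                          /* illustrative scale */
    double safe  = scaled_colnorm(col, 2, tscal); /* stays finite */
    printf("naive = %g, scaled = %g\n", naive, safe);
    return 0;
}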
+ CALL ZTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/zlatrs3.c b/lapack-netlib/SRC/zlatrs3.c new file mode 100644 index 000000000..0cb1cda54 --- /dev/null +++ b/lapack-netlib/SRC/zlatrs3.c @@ -0,0 +1,1283 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char 
*name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = 
{pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX*16 A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. 
*/ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX*16 array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
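As documented above, LWORK = -1 requests a workspace query, with the required size returned in WORK(1). The following caller-side sketch exercises that convention against the zlatrs3_ prototype defined in this file. It assumes a non-ILP64 build (32-bit integer), must be linked against a LAPACK that actually exports zlatrs3_, and the 2-by-2 upper triangular system is invented purely for illustration.

/* Hypothetical usage of the workspace-query convention of ZLATRS3. */
#include <stdio.h>
#include <stdlib.h>

typedef int integer;                        /* matches the non-ILP64 typedef above */
typedef double doublereal;
typedef struct { doublereal r, i; } doublecomplex;

extern int zlatrs3_(char *uplo, char *trans, char *diag, char *normin,
                    integer *n, integer *nrhs, doublecomplex *a, integer *lda,
                    doublecomplex *x, integer *ldx, doublereal *scale,
                    doublereal *cnorm, doublereal *work, integer *lwork,
                    integer *info);

int main(void) {
    integer n = 2, nrhs = 1, lda = 2, ldx = 2, info = 0, lwork = -1;
    doublecomplex a[4] = { {2,0}, {0,0}, {1,0}, {4,0} };  /* upper triangular, column-major */
    doublecomplex x[2] = { {3,0}, {8,0} };                /* right-hand side b */
    doublereal scale[1], cnorm[2], query[1];

    /* 1) Workspace query: required size comes back in query[0]. */
    zlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, query, &lwork, &info);
    lwork = (integer)query[0];
    doublereal *work = malloc((size_t)lwork * sizeof *work);

    /* 2) Solve: on exit x holds the solution of A*x = scale[0]*b. */
    zlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, work, &lwork, &info);
    printf("info=%d scale=%g x=(%g,%g) (%g,%g)\n",
           (int)info, scale[0], x[0].r, x[0].i, x[1].r, x[1].i);
    free(work);
    return 0;
}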
*/ + +/* ===================================================================== */ +/* Subroutine */ int zlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublecomplex *a, integer *lda, + doublecomplex *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + doublereal w[64]; + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast; + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit; + extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + integer *, doublecomplex *, integer *, doublecomplex *, + doublereal *, doublereal *, integer *); + logical lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). 
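The two-part WORK layout described above (local scale factors followed by block norm bounds) can be previewed with a few integer computations. The sketch below mirrors the size formulas used in the code but assumes a fixed block size nb instead of the ILAENV-chosen one; it computes offsets only and performs no solve.

/* Rough sketch of the WORK layout: sizes and offsets only. */
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
    int n = 1000, nrhs = 50, nb = 64;              /* illustrative sizes */
    int nba = MAX(1, (n + nb - 1) / nb);           /* block rows/cols of A */
    int nbrhs = 32;                                /* RHS per block column of X */
    int lscale = nba * MAX(nba, MIN(nrhs, nbrhs)); /* local scale factors */
    int lds = nba;                                 /* leading dim of that part */
    int lanrm = nba * nba;                         /* one norm bound per block */
    int awrk = lscale;                             /* norms start after scales */

    /* 1-based indexing as in the source:
     *   WORK( I + KK*LDS )           -> scale of block row I, local column KK
     *   WORK( AWRK + I + (J-1)*NBA ) -> upper bound of block A(I,J)          */
    printf("nba=%d  lwork=%d (scales=%d, norms=%d, lds=%d, awrk=%d)\n",
           nba, lscale + lanrm, lscale, lanrm, lds, awrk);
    return 0;
}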
*/ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + zlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = zlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + zdscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = zlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = i2 - i1; + zdscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) 
{ + i__7 = j2 - j1; + zdscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__6, &i__7, &i__8, &z__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("T", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) { + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of ZLATRS3 */ + +} /* zlatrs3_ */ + diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f new file mode 100644 index 000000000..fc1be0517 --- /dev/null +++ b/lapack-netlib/SRC/zlatrs3.f @@ -0,0 +1,667 @@ +*> \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX*16 A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. 
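Per the SCALE description above, column k of the returned X satisfies A * X(:,k) = scale(k) * B(:,k), so a caller recovers the solution of the original system by dividing by scale(k) when that is positive and the division cannot overflow. A small stand-alone illustration of that caller-side step (not library code; the numbers are invented):

/* Interpreting the SCALE output: divide each column by its scale factor. */
#include <complex.h>
#include <stdio.h>

static void unscale_column(double complex *x, int n, double scale) {
    if (scale > 0.0 && scale != 1.0)
        for (int i = 0; i < n; ++i)
            x[i] /= scale;          /* may overflow if scale is tiny */
    /* scale == 0.0 means A was singular or the system hopelessly scaled;
     * the solver already set x, so it is left untouched. */
}

int main(void) {
    double complex x[2] = { 0.5 + 0.0 * I, 2.0 + 0.0 * I };
    unscale_column(x, 2, 0.5);      /* pretend the solver returned scale = 0.5 */
    printf("%g %g\n", creal(x[0]), creal(x[1]));
    return 0;
}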
+*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). 
+*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( LDX, * ) + DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
+ DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE, DLARMM + EXTERNAL ILAENV, LSAME, DLAMCH, ZLANGE, DLARMM +* .. +* .. External Subroutines .. + EXTERNAL ZLATRS, ZDSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'ZLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. 
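The block loops that follow all use the same index convention: block J covers rows/columns J1 .. J2-1, with J1 = (J-1)*NB + 1 and J2 = MIN(J*NB, N) + 1, so J2 - J1 is the block dimension and the last block may be short. A tiny C illustration of that convention (block size and matrix order are arbitrary):

/* Enumerate the 1-based, half-open block ranges used by ZLATRS3. */
#include <stdio.h>

int main(void) {
    int n = 100, nb = 32;
    int nba = (n + nb - 1) / nb;                 /* number of blocks */
    for (int j = 1; j <= nba; ++j) {
        int j1 = (j - 1) * nb + 1;
        int j2 = (j * nb < n ? j * nb : n) + 1;  /* MIN(J*NB, N) + 1 */
        printf("block %d: rows %d..%d (%d rows)\n", j, j1, j2 - 1, j2 - j1);
    }
    return 0;
}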
+* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = ZLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = ZLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1) * NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = ZLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. 
+ SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL ZDSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = ZLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). 
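The consistency step described above brings two vector segments with different local scale factors to the common factor SCAMIN, optionally shrunk further by a robust growth factor before the GEMM update. The scalar-level sketch below shows that the represented (unscaled) values are unchanged by the rescaling; the segments are single numbers here and scaloc is a fixed toy value rather than the DLARMM bound.

/* Consistent scaling of two segments to a common scale factor. */
#include <stdio.h>

int main(void) {
    double s_i = 1.0, s_j = 0.25;     /* local scale factors */
    double b_i = 8.0, x_j = 2.0;      /* stored data: true values are b_i/s_i, x_j/s_j */
    double scamin = s_i < s_j ? s_i : s_j;
    double scaloc = 0.5;              /* robust growth factor (DLARMM in the real code) */

    b_i *= (scamin / s_i) * scaloc;   /* rescale segment I */
    x_j *= (scamin / s_j) * scaloc;   /* rescale segment J */
    s_i = s_j = scamin * scaloc;      /* both segments now share one factor */

    /* The represented values are unchanged: both still print 8. */
    printf("b = %g, x = %g, common scale = %g\n", b_i / s_i, x_j / s_j, s_i);
    return 0;
}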
+* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL ZGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL ZGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL ZGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO + +* +* Reduce local scaling factors +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of ZLATRS3 +* + END diff --git a/lapack-netlib/SRC/zstedc.c b/lapack-netlib/SRC/zstedc.c index 4cfc41840..55baba2d7 100644 --- a/lapack-netlib/SRC/zstedc.c +++ b/lapack-netlib/SRC/zstedc.c @@ -836,10 +836,10 @@ f"> */ lrwmin = *n - 1 << 1; } else if (icompz == 1) { lgn = (integer) (log((doublereal) (*n)) / log(2.)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } lwmin = *n * *n; diff --git a/lapack-netlib/SRC/zsysv.f b/lapack-netlib/SRC/zsysv.f index ed173dadc..44f1e25b1 100644 --- a/lapack-netlib/SRC/zsysv.f +++ b/lapack-netlib/SRC/zsysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL ZSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsysv_rk.f b/lapack-netlib/SRC/zsysv_rk.f index df828ee33..8d9fb82c8 100644 --- a/lapack-netlib/SRC/zsysv_rk.f +++ b/lapack-netlib/SRC/zsysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL ZSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsysv_rook.f b/lapack-netlib/SRC/zsysv_rook.f index 7c9fb4bf6..745339512 100644 --- a/lapack-netlib/SRC/zsysv_rook.f +++ b/lapack-netlib/SRC/zsysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL ZSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsyswapr.f b/lapack-netlib/SRC/zsyswapr.f index 1f1a87857..eb3c98c34 100644 --- a/lapack-netlib/SRC/zsyswapr.f +++ b/lapack-netlib/SRC/zsyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is COMPLEX*16 array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by ZSYTRF. 
-*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is COMPLEX*16 array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, N ) + COMPLEX*16 A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I COMPLEX*16 TMP * * .. External Functions .. @@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL ZSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL ZSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL ZSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL ZSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE ZSYSWAPR diff --git a/lapack-netlib/SRC/ztprfb.f b/lapack-netlib/SRC/ztprfb.f index 2edbd0566..7b1bc17a0 100644 --- a/lapack-netlib/SRC/ztprfb.f +++ b/lapack-netlib/SRC/ztprfb.f @@ -1,4 +1,4 @@ -*> \brief \b ZTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b ZTPRFB applies a complex "triangular-pentagonal" block reflector to a complex matrix, which is composed of two blocks. 
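For reference, the ZSYSWAPR change above replaces element-by-element loops in the upper-triangular branch with strided ZSWAP calls. The C sketch below writes out the swaps visible in that hunk on a real, column-major matrix, so the increment pattern (LDA for a row piece, 1 for a column piece) is explicit; the function and macro names are illustrative only.

#include <stddef.h>

/* 1-based, column-major element access, as in the Fortran source. */
#define A_ELT(a, lda, i, j) (a)[((i) - 1) + (size_t)((j) - 1) * (lda)]

/* Upper-triangular part of the symmetric interchange I1 <-> I2 (I1 < I2),
 * spelled out as loops equivalent to the ZSWAP calls in ZSYSWAPR.       */
static void syswapr_upper(double *a, int lda, int n, int i1, int i2)
{
    double tmp;
    int k, j;

    /* swap of the diagonal entries A(I1,I1) and A(I2,I2) */
    tmp = A_ELT(a, lda, i1, i1);
    A_ELT(a, lda, i1, i1) = A_ELT(a, lda, i2, i2);
    A_ELT(a, lda, i2, i2) = tmp;

    /* second swap: row piece A(I1, I1+1:I2-1) <-> column piece A(I1+1:I2-1, I2),
     * i.e. ZSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 )                    */
    for (k = 1; k <= i2 - i1 - 1; ++k) {
        tmp = A_ELT(a, lda, i1, i1 + k);
        A_ELT(a, lda, i1, i1 + k) = A_ELT(a, lda, i1 + k, i2);
        A_ELT(a, lda, i1 + k, i2) = tmp;
    }

    /* third swap: rows I1 and I2 over the trailing columns I2+1..N,
     * i.e. ZSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ), guarded by I2 < N */
    for (j = i2 + 1; j <= n; ++j) {
        tmp = A_ELT(a, lda, i1, j);
        A_ELT(a, lda, i1, j) = A_ELT(a, lda, i2, j);
        A_ELT(a, lda, i2, j) = tmp;
    }
}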
* * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/ztrsyl3.c b/lapack-netlib/SRC/ztrsyl3.c new file mode 100644 index 000000000..314b0f98d --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.c @@ -0,0 +1,2027 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist 
{ + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p 
= {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,N) */ +/* > The upper triangular matrix B. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX*16 array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* > \ingroup complex16SYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ int ztrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex + *b, integer *ldb, doublecomplex *c__, integer *ldc, doublereal *scale, + doublereal *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3, d__4; + doublecomplex z__1; + + /* Local variables */ + doublereal scal; + doublecomplex csgn; + doublereal anrm, bnrm, cnrm; + integer awrk, bwrk; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(doublereal *); + integer nb, jj, ll; + extern doublereal dlamch_(char *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *), zlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublecomplex * + , integer *, integer *); + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int ztrsyl_(char *, char *, integer *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublereal *, integer *); + integer nba, nbb; + doublereal buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ztrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + z__1.r = sgn, z__1.i = 0.; + csgn.r = z__1.r, csgn.i = z__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
*/ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = zlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = zlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__4, &i__5, &i__6, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = zlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__4, &i__5, &i__6, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = zlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + zdscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__3, &i__4, &i__5, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = zlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = zlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__2, &i__3, &i__4, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. */ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1. && buf > 0.) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + d__3 = (d__1 = c__[i__1].r, abs(d__1)), d__4 = (d__2 = d_imag(&c__[ + c_dim1 + 1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + d__3 = scal, d__4 = (d__1 = c__[i__3].r, abs(d__1)), d__3 = + f2cmax(d__3,d__4), d__4 = (d__2 = d_imag(&c__[k + l * + c_dim1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + zlascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. 
*/ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return 0; + +/* End of ZTRSYL3 */ + +} /* ztrsyl3_ */ + diff --git a/lapack-netlib/SRC/ztrsyl3.f b/lapack-netlib/SRC/ztrsyl3.f new file mode 100644 index 000000000..b5a058da4 --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b ZTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> ZTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. 
+*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complex16SYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) + DOUBLE PRECISION SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX*16 CSGN +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLARMM, ZLANGE + EXTERNAL DLAMCH, DLARMM, ILAENV, LSAME, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMM, ZLASCL, ZTRSYL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DIMAG, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'ZTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. 
ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) + CSGN = DCMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
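+*
+*        SCAL computed below is the largest magnitude of any real or
+*        imaginary part of an entry of the current solution stored in C,
+*        so BIGNUM / SCAL bounds how much the solution can still be
+*        scaled up without overflowing.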
+* + SCAL = MAX( ABS( DBLE( C( 1, 1 ) ) ), + $ ABS( DIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( DBLE ( C( K, L ) ) ), + $ ABS( DIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL ZLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of ZTRSYL3 +* + END diff --git a/lapack-netlib/SRC/zunbdb2.f b/lapack-netlib/SRC/zunbdb2.f index 412d8d8d0..46b08aa1e 100644 --- a/lapack-netlib/SRC/zunbdb2.f +++ b/lapack-netlib/SRC/zunbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX*16 array, dimension (P) +*> TAUP1 is COMPLEX*16 array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX*16 array, dimension (M-P) +*> TAUP2 is COMPLEX*16 array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/zunbdb4.f b/lapack-netlib/SRC/zunbdb4.f index b1fcd8bd0..4672cfa67 100644 --- a/lapack-netlib/SRC/zunbdb4.f +++ b/lapack-netlib/SRC/zunbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX*16 array, dimension (P) +*> TAUP1 is COMPLEX*16 array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX*16 array, dimension (M-P) +*> TAUP2 is COMPLEX*16 array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f index ec681b597..ed666e449 100644 --- a/lapack-netlib/SRC/zunbdb6.f +++ b/lapack-netlib/SRC/zunbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,16 +173,19 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01D0, REALONE = 1.0D0, + DOUBLE PRECISION ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) COMPLEX*16 NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0), $ ZERO = (0.0D0,0.0D0) ) * .. * .. Local Scalars .. - INTEGER I - DOUBLE PRECISION NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + DOUBLE PRECISION EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. 
+ DOUBLE PRECISION DLAMCH * .. * .. External Subroutines .. EXTERNAL ZGEMV, ZLASSQ, XERBLA @@ -211,17 +220,17 @@ CALL XERBLA( 'ZUNBDB6', -INFO ) RETURN END IF +* + EPS = DLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -239,27 +248,31 @@ CALL ZGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -281,24 +294,22 @@ CALL ZGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. 
ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -307,4 +318,3 @@ * End of ZUNBDB6 * END - diff --git a/lapack-netlib/SRC/zungbr.f b/lapack-netlib/SRC/zungbr.f index 3dfca43be..c42a372c5 100644 --- a/lapack-netlib/SRC/zungbr.f +++ b/lapack-netlib/SRC/zungbr.f @@ -233,7 +233,7 @@ END IF END IF END IF - LWKOPT = DBLE( WORK( 1 ) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 226004a90..d252c7fa9 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -40,7 +40,7 @@ set(SEIGTST schkee.F sget54.f sglmts.f sgqrts.f sgrqts.f sgsvts3.f shst01.f slarfy.f slarhs.f slatm4.f slctes.f slctsx.f slsets.f sort01.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f - sstt22.f ssyt21.f ssyt22.f) + sstt22.f ssyl01.f ssyt21.f ssyt22.f) set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f @@ -56,7 +56,7 @@ set(CEIGTST cchkee.F cget54.f cglmts.f cgqrts.f cgrqts.f cgsvts3.f chbt21.f chet21.f chet22.f chpt21.f chst01.f clarfy.f clarhs.f clatm4.f clctes.f clctsx.f clsets.f csbmv.f - csgt01.f cslect.f + csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f @@ -77,7 +77,7 @@ set(DEIGTST dchkee.F dget54.f dglmts.f dgqrts.f dgrqts.f dgsvts3.f dhst01.f dlarfy.f dlarhs.f dlatm4.f dlctes.f dlctsx.f dlsets.f dort01.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f - dstt22.f dsyt21.f dsyt22.f) + dstt22.f dsyl01.f dsyt21.f dsyt22.f) set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f @@ -93,7 +93,7 @@ set(ZEIGTST zchkee.F zget54.f zglmts.f zgqrts.f zgrqts.f zgsvts3.f zhbt21.f zhet21.f zhet22.f zhpt21.f zhst01.f zlarfy.f zlarhs.f zlatm4.f zlctes.f zlctsx.f zlsets.f zsbmv.f - zsgt01.f zslect.f + zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) macro(add_eig_executable name) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index bccfccf95..942ae6982 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -62,7 +62,7 @@ SEIGTST = schkee.o \ sget54.o sglmts.o sgqrts.o sgrqts.o sgsvts3.o \ shst01.o slarfy.o slarhs.o slatm4.o slctes.o slctsx.o slsets.o sort01.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ - sstt22.o ssyt21.o ssyt22.o + sstt22.o ssyl01.o ssyt21.o ssyt22.o CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ @@ -78,7 +78,7 @@ CEIGTST = cchkee.o \ cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts3.o \ chbt21.o chet21.o chet22.o chpt21.o chst01.o \ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ - csgt01.o cslect.o \ + csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ @@ -99,7 +99,7 @@ DEIGTST = dchkee.o \ dget54.o dglmts.o dgqrts.o dgrqts.o dgsvts3.o \ dhst01.o dlarfy.o dlarhs.o dlatm4.o dlctes.o dlctsx.o dlsets.o dort01.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ - dstt22.o dsyt21.o dsyt22.o + dstt22.o dsyl01.o dsyt21.o dsyt22.o ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ @@ -115,7 +115,7 @@ ZEIGTST = zchkee.o \ zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts3.o \ zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ - zsgt01.o zslect.o \ + 
zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o .PHONY: all diff --git a/lapack-netlib/TESTING/EIG/cchkec.f b/lapack-netlib/TESTING/EIG/cchkec.f index 6727a0954..c892b0a54 100644 --- a/lapack-netlib/TESTING/EIG/cchkec.f +++ b/lapack-netlib/TESTING/EIG/cchkec.f @@ -23,7 +23,7 @@ *> \verbatim *> *> CCHKEC tests eigen- condition estimation routines -*> CTRSYL, CTREXC, CTRSNA, CTRSEN +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA, CTRSEN *> *> In all cases, the routine runs through a fixed set of numerical *> examples, subjects them to various tests, and compares the test @@ -88,17 +88,17 @@ * .. Local Scalars .. LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - REAL EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + REAL EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38 + EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38, CSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -120,10 +120,24 @@ $ CALL CERREC( PATH, NOUT ) * OK = .TRUE. - CALL CGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL CGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL CSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL CGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -169,6 +183,12 @@ $ / ' Safe minimum (SFMIN) = ', E16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9972 FORMAT( 'CTRSYL and CTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in CTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in CTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) RETURN * * End of CCHKEC diff --git a/lapack-netlib/TESTING/EIG/cdrvsg.f b/lapack-netlib/TESTING/EIG/cdrvsg.f index a93933a27..d15b39d01 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg.f @@ -663,8 +663,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*SLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*SLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/cerrec.f b/lapack-netlib/TESTING/EIG/cerrec.f index 650ab2b6e..6e2e1d38a 100644 --- a/lapack-netlib/TESTING/EIG/cerrec.f +++ b/lapack-netlib/TESTING/EIG/cerrec.f @@ -23,7 +23,7 @@ *> *> CERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> CTRSYL, CTREXC, CTRSNA and CTRSEN. 
+*> CTRSYL, CTRSYL3, CTREXC, CTRSNA and CTRSEN. *> \endverbatim * * Arguments: @@ -77,12 +77,12 @@ * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - REAL RW( LW ), S( NMAX ), SEP( NMAX ) + REAL RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL + EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL, CTRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'CTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test CTRSYL3 +* + SRNAMT = 'CTRSYL3' + INFOT = 1 + CALL CTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test CTREXC * SRNAMT = 'CTREXC' diff --git a/lapack-netlib/TESTING/EIG/cget37.f b/lapack-netlib/TESTING/EIG/cget37.f index c2a6589f3..44d4580d6 100644 --- a/lapack-netlib/TESTING/EIG/cget37.f +++ b/lapack-netlib/TESTING/EIG/cget37.f @@ -265,7 +265,7 @@ 100 CONTINUE WSRT( KMIN ) = WSRT( I ) WSRT( I ) = VMIN - VCMIN = WTMP( I ) + VCMIN = REAL( WTMP( I ) ) WTMP( I ) = W( KMIN ) WTMP( KMIN ) = VCMIN VMIN = STMP( KMIN ) diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f new file mode 100644 index 000000000..82d790daa --- /dev/null +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -0,0 +1,294 @@ +*> \brief \b CSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CSYL01 tests CTRSYL and CTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. 
+*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements CGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual CTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual CTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times CTRSYL3 and CTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of CTRSYL +*> RMAX(2) = Value of the largest test ratio of CTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times CTRSYL where INFO is nonzero +*> NINFO(2) = No. of times CTRSYL3 where INFO is nonzero +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX RMUL +* .. +* .. Local Arrays .. + COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ) + REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, CLANGE + EXTERNAL SISNAN, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CLATMR, CLACPY, CGEMM, CTRSYL, CTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. 
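+*
+*     The tests below sweep over two diagonal scalings VM(J), both
+*     signs ISGN, several matrix sizes M and N, and all combinations
+*     of TRANA and TRANB.  For each case a random upper triangular A
+*     and B and a random right hand side C are generated with CLATMR,
+*     the equation is solved with both CTRSYL and CTRSYL3, the residual
+*     of each solution is checked against THRESH, and the scale factors
+*     and INFO values of the two routines are compared.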
+* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.5E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 23 + KLA = 0 + KUA = M - 1 + CALL CLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = CLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 29 + KLB = 0 + KUB = N - 1 + CALL CLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = CLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL CLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL CTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of CSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/dchkec.f b/lapack-netlib/TESTING/EIG/dchkec.f index 854961884..c4451a627 100644 --- a/lapack-netlib/TESTING/EIG/dchkec.f +++ b/lapack-netlib/TESTING/EIG/dchkec.f @@ -90,21 +90,23 @@ LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC DOUBLE PRECISION EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL DERREC, DGET31, DGET32, DGET33, DGET34, DGET35, - $ DGET36, DGET37, DGET38, DGET39, DGET40 + $ DGET36, DGET37, DGET38, DGET39, DGET40, DSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -153,10 +155,24 @@ WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL DGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL DGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL DSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. 
+ WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL DGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in DTGEXC: RMAX =', D12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'DTRSYL and DTRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in DTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in DTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of DCHKEC * diff --git a/lapack-netlib/TESTING/EIG/ddrvsg.f b/lapack-netlib/TESTING/EIG/ddrvsg.f index 0b49c8404..2e9d3c643 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg.f @@ -645,8 +645,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*DLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*DLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/derrec.f b/lapack-netlib/TESTING/EIG/derrec.f index d5863ad42..f11f48887 100644 --- a/lapack-netlib/TESTING/EIG/derrec.f +++ b/lapack-netlib/TESTING/EIG/derrec.f @@ -23,7 +23,7 @@ *> *> DERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> DTRSYL, DTREXC, DTRSNA and DTRSEN. +*> DTRSYL, DTRSYL3, DTREXC, DTRSNA and DTRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL + EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL, DTRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'DTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test DTRSYL3 +* + SRNAMT = 'DTRSYL3' + INFOT = 1 + CALL DTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test DTREXC * SRNAMT = 'DTREXC' diff --git a/lapack-netlib/TESTING/EIG/derred.f b/lapack-netlib/TESTING/EIG/derred.f index 6df517825..11a932052 100644 --- a/lapack-netlib/TESTING/EIG/derred.f +++ b/lapack-netlib/TESTING/EIG/derred.f @@ -99,7 +99,7 @@ * .. 
* .. External Subroutines .. EXTERNAL CHKXER, DGEES, DGEESX, DGEEV, DGEEVX, DGEJSV, - $ DGESDD, DGESVD, DGESVDX, DGESVQ + $ DGESDD, DGESVD, DGESVDX, DGESVDQ * .. * .. External Functions .. LOGICAL DSLECT, LSAMEN diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f new file mode 100644 index 000000000..782d2cd42 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dsyl01.f @@ -0,0 +1,288 @@ +*> \brief \b DSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYL01 tests DTRSYL and DTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements DGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual DTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual DTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times DTRSYL3 and DTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION, dimension (2) +*> RMAX(1) = Value of the largest test ratio of DTRSYL +*> RMAX(2) = Value of the largest test ratio of DTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times DTRSYL returns an expected INFO +*> NINFO(2) = No. of times DTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 245, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. 
+ CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + DOUBLE PRECISION A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 126 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL DLAMCH, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL DLATMR, DLACPY, DGEMM, DTRSYL, DTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.000001D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL DLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = DLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL DLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = DLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL DLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL DTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. 
TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of DSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/schkec.f b/lapack-netlib/TESTING/EIG/schkec.f index e6123e1ad..59abb2466 100644 --- a/lapack-netlib/TESTING/EIG/schkec.f +++ b/lapack-netlib/TESTING/EIG/schkec.f @@ -90,21 +90,23 @@ LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC REAL EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL SERREC, SGET31, SGET32, SGET33, SGET34, SGET35, - $ SGET36, SGET37, SGET38, SGET39, SGET40 + $ SGET36, SGET37, SGET38, SGET39, SGET40, SSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -153,10 +155,24 @@ WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL SGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL SGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL SSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. 
+ WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL SGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in STGEXC: RMAX =', E12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'STRSYL and STRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in STRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in STRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of SCHKEC * diff --git a/lapack-netlib/TESTING/EIG/sdrvsg.f b/lapack-netlib/TESTING/EIG/sdrvsg.f index 4a57223c8..877579bcd 100644 --- a/lapack-netlib/TESTING/EIG/sdrvsg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg.f @@ -645,8 +645,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*SLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*SLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/serrec.f b/lapack-netlib/TESTING/EIG/serrec.f index 249f0e642..9a7ceb362 100644 --- a/lapack-netlib/TESTING/EIG/serrec.f +++ b/lapack-netlib/TESTING/EIG/serrec.f @@ -23,7 +23,7 @@ *> *> SERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> STRSYL, STREXC, STRSNA and STRSEN. +*> STRSYL, STRSYL3, STREXC, STRSNA and STRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL + EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL, STRSYL3 * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'STRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test STRSYL3 +* + SRNAMT = 'STRSYL3' + INFOT = 1 + CALL STRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test STREXC * SRNAMT = 'STREXC' diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f new file mode 100644 index 000000000..22d089dc8 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/ssyl01.f @@ -0,0 +1,288 @@ +*> \brief \b SSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYL01 tests STRSYL and STRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements SGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual STRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual STRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times STRSYL3 and STRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is REAL, dimension (2) +*> RMAX(1) = Value of the largest test ratio of STRSYL +*> RMAX(2) = Value of the largest test ratio of STRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times STRSYL returns an expected INFO +*> NINFO(2) = No. of times STRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + REAL A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 54 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, SLANGE + EXTERNAL SISNAN, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SLATMR, SLACPY, SGEMM, STRSYL, STRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. 
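+*
+*     Overview of the test sweep below: for each diagonal scaling
+*     VM( J ), each sign ISGN = -1, +1, and each combination of
+*     TRANA, TRANB in { 'N', 'T' }, SLATMR generates random upper
+*     triangular A (M-by-M) and B (N-by-N), which are valid Schur
+*     forms, plus a random right-hand side.  Each generated Sylvester
+*     system is solved and the scaled residual
+*        norm( op(A)*X + ISGN*X*op(B) - SCALE*C ) /
+*        max( SMLNUM, SMLNUM*norm(X), EPS*max(norm(A),norm(B))*norm(X) )
+*     is compared against THRESH, first for STRSYL and then for
+*     STRSYL3.  NFAIL( 3 ) counts the cases in which STRSYL3 flushes
+*     the solution (SCALE3 = 0) although STRSYL does not, or the two
+*     routines return different INFO values.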
+* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.05E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL SLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = SLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL SLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = SLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL SLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL STRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ C, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, C, MAXM ) + RES1 = SLANGE( 'M', M, N, C, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL STRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = SLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of SSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/zchkec.f b/lapack-netlib/TESTING/EIG/zchkec.f index 1e1c29e0d..62a76d357 100644 --- a/lapack-netlib/TESTING/EIG/zchkec.f +++ b/lapack-netlib/TESTING/EIG/zchkec.f @@ -88,17 +88,17 @@ * .. Local Scalars .. LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - DOUBLE PRECISION EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + DOUBLE PRECISION EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38 + EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38, ZSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -120,10 +120,24 @@ $ CALL ZERREC( PATH, NOUT ) * OK = .TRUE. - CALL ZGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL ZGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL ZSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. 
+ WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL ZGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -148,7 +162,7 @@ WRITE( NOUT, FMT = 9996 )RTRSEN, LTRSEN, NTRSEN, KTRSEN END IF * - NTESTS = KTRSYL + KTREXC + KTRSNA + KTRSEN + NTESTS = KTRSYL + KTRSYL3 + KTREXC + KTRSNA + KTRSEN IF( OK ) $ WRITE( NOUT, FMT = 9995 )PATH, NTESTS * @@ -169,6 +183,12 @@ $ / ' Safe minimum (SFMIN) = ', D16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9970 FORMAT( 'Error in ZTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9971 FORMAT( 'Error in ZTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9972 FORMAT( 'ZTRSYL and ZTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') RETURN * * End of ZCHKEC diff --git a/lapack-netlib/TESTING/EIG/zdrvsg.f b/lapack-netlib/TESTING/EIG/zdrvsg.f index 336514a3f..71f1d6371 100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg.f @@ -663,8 +663,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*DLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*DLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/zerrec.f b/lapack-netlib/TESTING/EIG/zerrec.f index dc6129da9..e1938f57d 100644 --- a/lapack-netlib/TESTING/EIG/zerrec.f +++ b/lapack-netlib/TESTING/EIG/zerrec.f @@ -23,7 +23,7 @@ *> *> ZERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> ZTRSYL, ZTREXC, ZTRSNA and ZTRSEN. +*> ZTRSYL, ZTRSYL3, ZTREXC, ZTRSNA and ZTRSEN. *> \endverbatim * * Arguments: @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ) + DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX*16 A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. 
@@ -141,6 +141,43 @@ CALL CHKXER( 'ZTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test ZTRSYL3 +* + SRNAMT = 'ZTRSYL3' + INFOT = 1 + CALL ZTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test ZTREXC * SRNAMT = 'ZTREXC' diff --git a/lapack-netlib/TESTING/EIG/zerred.f b/lapack-netlib/TESTING/EIG/zerred.f index d1219c02b..1876c1f1d 100644 --- a/lapack-netlib/TESTING/EIG/zerred.f +++ b/lapack-netlib/TESTING/EIG/zerred.f @@ -100,7 +100,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, ZGEES, ZGEESX, ZGEEV, ZGEEVX, ZGESVJ, - $ ZGESDD, ZGESVD, ZGESVDX, ZGESVQ + $ ZGESDD, ZGESVD, ZGESVDX, ZGESVDQ * .. * .. External Functions .. LOGICAL LSAMEN, ZSLECT diff --git a/lapack-netlib/TESTING/EIG/zget37.f b/lapack-netlib/TESTING/EIG/zget37.f index 63680e855..5013fbdd9 100644 --- a/lapack-netlib/TESTING/EIG/zget37.f +++ b/lapack-netlib/TESTING/EIG/zget37.f @@ -265,7 +265,7 @@ 100 CONTINUE WSRT( KMIN ) = WSRT( I ) WSRT( I ) = VMIN - VCMIN = WTMP( I ) + VCMIN = DBLE( WTMP( I ) ) WTMP( I ) = W( KMIN ) WTMP( KMIN ) = VCMIN VMIN = STMP( KMIN ) diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f new file mode 100644 index 000000000..329f39dc4 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -0,0 +1,294 @@ +*> \brief \b ZSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZSYL01 tests ZTRSYL and ZTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements ZGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. 
+*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual ZTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual ZTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times ZTRSYL3 and ZTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of ZTRSYL +*> RMAX(2) = Value of the largest test ratio of ZTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times ZTRSYL returns an expected INFO +*> NINFO(2) = No. of times ZTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D+0 ) ) + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 185, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX*16 RMUL +* .. +* .. Local Arrays .. + COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ) + DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL DISNAN, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZLATMR, ZLACPY, ZGEMM, ZTRSYL, ZTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, SQRT +* .. +* .. Executable Statements .. 
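+*
+*     The sweep below follows the same pattern as DSYL01/SSYL01 for
+*     the complex solvers: ZLATMR generates random upper triangular
+*     A and B, and each system is solved with ZTRSYL and with
+*     ZTRSYL3, with op() taken as 'N' or 'C' (conjugate transpose).
+*     Two differences from the real testers: the diagonals of both
+*     A and B are scaled by VM( J ), and ZTRSYL3 uses only the real
+*     workspace SWORK (there is no integer workspace argument).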
+* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.05D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 51 + KLA = 0 + KUA = M - 1 + CALL ZLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = ZLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL ZLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = ZLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL ZLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL ZTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of ZSYL01 +* + END diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index 2cc0fba06..f0423a23b 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -608,17 +608,18 @@ ELSE IF( LSAMEN( 2, P2, 'LS' ) ) THEN * * LS: Least Squares driver routines for -* LS, LSD, LSS, LSX and LSY. +* LS, LST, TSLS, LSD, LSS, LSX and LSY. * WRITE( IOUNIT, FMT = 9984 )PATH WRITE( IOUNIT, FMT = 9967 ) - WRITE( IOUNIT, FMT = 9921 )C1, C1, C1, C1 + WRITE( IOUNIT, FMT = 9921 )C1, C1, C1, C1, C1, C1 WRITE( IOUNIT, FMT = 9935 )1 WRITE( IOUNIT, FMT = 9931 )2 - WRITE( IOUNIT, FMT = 9933 )3 - WRITE( IOUNIT, FMT = 9935 )4 - WRITE( IOUNIT, FMT = 9934 )5 - WRITE( IOUNIT, FMT = 9932 )6 + WRITE( IOUNIT, FMT = 9919 ) + WRITE( IOUNIT, FMT = 9933 )7 + WRITE( IOUNIT, FMT = 9935 )8 + WRITE( IOUNIT, FMT = 9934 )9 + WRITE( IOUNIT, FMT = 9932 )10 WRITE( IOUNIT, FMT = 9920 ) WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) * @@ -1048,10 +1049,11 @@ $ 'check if X is in the row space of A or A'' ', $ '(overdetermined case)' ) 9929 FORMAT( ' Test ratios (1-3: ', A1, 'TZRZF):' ) - 9920 FORMAT( 3X, ' 7-10: same as 3-6', 3X, ' 11-14: same as 3-6' ) - 9921 FORMAT( ' Test ratios:', / ' (1-2: ', A1, 'GELS, 3-6: ', A1, - $ 'GELSY, 7-10: ', A1, 'GELSS, 11-14: ', A1, 'GELSD, 15-16: ', - $ A1, 'GETSLS)') + 9919 FORMAT( 3X, ' 3-4: same as 1-2', 3X, ' 5-6: same as 1-2' ) + 9920 FORMAT( 3X, ' 11-14: same as 7-10', 3X, ' 15-18: same as 7-10' ) + 9921 FORMAT( ' Test ratios:', / ' (1-2: ', A1, 'GELS, 3-4: ', A1, + $ 'GELST, 5-6: ', A1, 'GETSLS, 7-10: ', A1, 'GELSY, 11-14: ', + $ A1, 'GETSS, 15-18: ', A1, 'GELSD)' ) 9928 FORMAT( 7X, 'where ALPHA = ( 1 + SQRT( 17 ) ) / 8' ) 9927 FORMAT( 3X, I2, ': ABS( Largest element in L )', / 12X, $ ' - ( 1 / ( 1 - ALPHA ) ) + THRESH' ) diff --git a/lapack-netlib/TESTING/LIN/cchkpt.f b/lapack-netlib/TESTING/LIN/cchkpt.f index 2ec802064..7dc367eeb 100644 --- a/lapack-netlib/TESTING/LIN/cchkpt.f +++ b/lapack-netlib/TESTING/LIN/cchkpt.f @@ -319,15 +319,15 @@ * elements. * IF( IZERO.EQ.1 ) THEN - D( 1 ) = Z( 2 ) + D( 1 ) = REAL( Z( 2 ) ) IF( N.GT.1 ) $ E( 1 ) = Z( 3 ) ELSE IF( IZERO.EQ.N ) THEN E( N-1 ) = Z( 1 ) - D( N ) = Z( 2 ) + D( N ) = REAL( Z( 2 ) ) ELSE E( IZERO-1 ) = Z( 1 ) - D( IZERO ) = Z( 2 ) + D( IZERO ) = REAL( Z( 2 ) ) E( IZERO ) = Z( 3 ) END IF END IF diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index ce1ecf761..4b09361d8 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS +*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. 
CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -210,9 +210,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, CCOPY, CERRTR, CGET04, - $ CLACPY, CLARHS, CLATRS, CLATTR, CTRCON, CTRRFS, - $ CTRT01, CTRT02, CTRT03, CTRT05, CTRT06, CTRTRI, - $ CTRTRS, XLAENV + $ CLACPY, CLARHS, CLATRS, CLATRS3, CLATTR, + $ CSSCAL, CTRCON, CTRRFS, CTRT01, CTRT02, CTRT03, + $ CTRT05, CTRT06, CTRTRI, CTRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Complex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = REAL( A( 1 ) ) * CALL CTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B. +* + SRNAMT = 'CLATRS3' + CALL CCOPY( N, X, 1, B, 1 ) + CALL CCOPY( N, X, 1, B( N+1 ), 1 ) + CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from CLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL CSSCAL( N, BIGNUM, X, 1 ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'CLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cdrvgt.f b/lapack-netlib/TESTING/LIN/cdrvgt.f index 8d43f640f..acfbbcfa1 100644 --- a/lapack-netlib/TESTING/LIN/cdrvgt.f +++ b/lapack-netlib/TESTING/LIN/cdrvgt.f @@ -307,16 +307,16 @@ IZERO = 0 ELSE IF( IMAT.EQ.8 ) THEN IZERO = 1 - Z( 2 ) = A( N ) + Z( 2 ) = REAL( A( N ) ) A( N ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = A( 1 ) + Z( 3 ) = REAL( A( 1 ) ) A( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N - Z( 1 ) = A( 3*N-2 ) - Z( 2 ) = A( 2*N-1 ) + Z( 1 ) = REAL( A( 3*N-2 ) ) + Z( 2 ) = REAL( A( 2*N-1 ) ) A( 3*N-2 ) = ZERO A( 2*N-1 ) = ZERO ELSE diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f index 7fe189e5f..ecba705d5 100644 --- a/lapack-netlib/TESTING/LIN/cdrvls.f +++ b/lapack-netlib/TESTING/LIN/cdrvls.f @@ -31,7 +31,8 @@ *> *> \verbatim *> -*> CDRVLS tests the least squares driver routines CGELS, CGETSLS, CGELSS, CGELSY +*> CDRVLS tests the least squares driver routines CGELS, CGELST, +*> CGETSLS, CGELSS, CGELSY *> and CGELSD. *> \endverbatim * @@ -211,7 +212,7 @@ * * .. Parameters .. 
INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) REAL ONE, ZERO @@ -228,8 +229,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, LRWORK, - $ LWORK_CGELS, LWORK_CGETSLS, LWORK_CGELSS, - $ LWORK_CGELSY, LWORK_CGELSD, + $ LWORK_CGELS, LWORK_CGELST, LWORK_CGETSLS, + $ LWORK_CGELSS, LWORK_CGELSY, LWORK_CGELSD, $ LRWORK_CGELSY, LRWORK_CGELSS, LRWORK_CGELSD REAL EPS, NORMA, NORMB, RCOND * .. @@ -249,7 +250,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, CERRLS, CGELS, CGELSD, - $ CGELSS, CGELSY, CGEMM, CGETSLS, CLACPY, + $ CGELSS, CGELST, CGELSY, CGEMM, CGETSLS, CLACPY, $ CLARNV, CQRT13, CQRT15, CQRT16, CSSCAL, $ SAXPY, XLAENV * .. @@ -334,7 +335,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -361,6 +363,10 @@ CALL CGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_CGELS = INT( WQ( 1 ) ) +* Compute workspace needed for CGELST + CALL CGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_CGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for CGETSLS CALL CGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -425,21 +431,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 100 -* +* ===================================================== +* Begin test CGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test CGELS -* * Generate a matrix of scaling type ISCALE * CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -484,15 +495,20 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for CGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL CQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, RWORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for CGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN @@ -515,7 +531,7 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -524,26 +540,34 @@ $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test CGETSLS + END DO + END DO + END IF +* ===================================================== +* End test CGELS +* ===================================================== +* ===================================================== +* Begin test CGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + CALL XLAENV( 3, NXVAL( INB ) ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -560,9 +584,9 @@ IF( NCOLS.GT.0 ) THEN CALL CLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) - CALL CSCAL( NCOLS*NRHS, - $ CONE / REAL( NCOLS ), WORK, - $ 1 ) + CALL CSSCAL( NCOLS*NRHS, + $ ONE / REAL( NCOLS ), WORK, + $ 1 ) END IF CALL CGEMM( TRANS, 'No transpose', NROWS, $ NRHS, NCOLS, CONE, COPYA, LDA, @@ -578,31 +602,37 @@ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'CGETSLS ' - CALL CGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'CGELST' + CALL CGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) +* IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'CGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'CGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for CGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL CQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK2, - $ RESULT( 15 ) ) + $ LDA, B, LDB, C, LDB, RWORK, + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for CGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * * Solving LS system * - RESULT( 16 ) = CQRT17( TRANS, 1, M, N, + RESULT( 4 ) = CQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -610,7 +640,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = CQRT14( TRANS, M, N, + RESULT( 4 ) = CQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -618,21 +648,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 )TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test CGELST +* ===================================================== +* ===================================================== +* Begin test CGELSTSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO INB = 1, NNB + MB = NBVAL( INB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO IMB = 1, NNB + NB = NBVAL( IMB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'C' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL CLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL CSCAL( NCOLS*NRHS, + $ CONE / REAL( NCOLS ), + $ WORK, 1 ) + END IF + CALL CGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, CONE, + $ COPYA, LDA, WORK, LDWORK, + $ CZERO, B, LDB ) + CALL CLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL CLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL CLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'CGETSLS ' + CALL CGETSLS( TRANS, M, N, NRHS, A, + $ LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CGETSLS ', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for CGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL CLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL CQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK2, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for CGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = CQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = CQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, B, + $ LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 )TRANS, + $ M, N, NRHS, MB, NB, ITYPE, K, + $ RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test CGELSTSLS +* ==================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -680,37 +840,37 @@ * * workspace used: 2*MNMIN+NB*NB+NB*MAX(N,NRHS) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = CQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = CQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK, RWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 4 ) ) + $ RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -736,38 +896,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 8 ) ) + $ RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -792,45 +952,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 12 ) ) + $ RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace 
of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 80 K = 3, 14 + DO 80 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/cerrls.f b/lapack-netlib/TESTING/LIN/cerrls.f index 48e44ad86..fca943918 100644 --- a/lapack-netlib/TESTING/LIN/cerrls.f +++ b/lapack-netlib/TESTING/LIN/cerrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> CERRLS tests the error exits for the COMPLEX least squares -*> driver routines (CGELS, CGELSS, CGELSY, CGELSD). +*> driver routines (CGELS, CGELST, CGETSLS, CGELSS, CGELSY, CGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CGELS, CGELSD, CGELSS, CGELSY, CHKXER + EXTERNAL ALAESM, CHKXER, CGELS, CGELSD, CGELSS, CGELST, + $ CGELSY, CGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL CGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'CGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'CGELS ', INFOT, NOUT, LERR, OK ) * +* CGELST +* + SRNAMT = 'CGELST' + INFOT = 1 + CALL CGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) +* +* CGETSLS +* + SRNAMT = 'CGETSLS' + INFOT = 1 + CALL CGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) +* * CGELSS * SRNAMT = 'CGELSS' diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f index db65edd88..9ba784f62 100644 --- a/lapack-netlib/TESTING/LIN/cerrtr.f +++ b/lapack-netlib/TESTING/LIN/cerrtr.f @@ -82,9 +82,10 @@ EXTERNAL 
LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, CTBCON, - $ CTBRFS, CTBTRS, CTPCON, CTPRFS, CTPTRI, CTPTRS, - $ CTRCON, CTRRFS, CTRTI2, CTRTRI, CTRTRS + EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, + $ CLATRS3, CTBCON, CTBRFS, CTBTRS, CTPCON, + $ CTPRFS, CTPTRI, CTPTRS, CTRCON, CTRRFS, CTRTI2, + $ CTRTRI, CTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -240,6 +241,46 @@ CALL CLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'CLATRS', INFOT, NOUT, LERR, OK ) * +* CLATRS3 +* + SRNAMT = 'CLATRS3' + INFOT = 1 + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/clattp.f b/lapack-netlib/TESTING/LIN/clattp.f index 82f0585df..a47a252ad 100644 --- a/lapack-netlib/TESTING/LIN/clattp.f +++ b/lapack-netlib/TESTING/LIN/clattp.f @@ -336,7 +336,7 @@ WORK( J+1 ) = PLUS2 WORK( N+J+1 ) = ZERO PLUS1 = STAR1 / PLUS2 - REXP = CLARND( 2, ISEED ) + REXP = REAL( CLARND( 2, ISEED ) ) IF( REXP.LT.ZERO ) THEN STAR1 = -SFAC**( ONE-REXP )*CLARND( 5, ISEED ) ELSE @@ -790,7 +790,7 @@ DO 460 J = 1, N / 2 JL = JJ DO 450 I = J, N - J - T = AP( JR-I+J ) + T = REAL( AP( JR-I+J ) ) AP( JR-I+J ) = AP( JL ) AP( JL ) = T JL = JL + I @@ -804,7 +804,7 @@ DO 480 J = 1, N / 2 JR = JJ DO 470 I = J, N - J - T = AP( JL+I-J ) + T = REAL( AP( JL+I-J ) ) AP( JL+I-J ) = AP( JR ) AP( JR ) = T JR = JR - I diff --git a/lapack-netlib/TESTING/LIN/cpbt01.f b/lapack-netlib/TESTING/LIN/cpbt01.f index 33c80666d..6145a1875 100644 --- a/lapack-netlib/TESTING/LIN/cpbt01.f +++ b/lapack-netlib/TESTING/LIN/cpbt01.f @@ -201,7 +201,8 @@ * * Compute the (K,K) element of the result. * - AKK = CDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) + AKK = REAL( + $ CDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) ) AFAC( KD+1, K ) = AKK * * Compute the rest of column K. @@ -228,7 +229,7 @@ * * Scale column K by the diagonal element. 
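+*
+*        The diagonal of the Cholesky factor of a Hermitian positive
+*        definite band matrix is real, so only the real part of the
+*        COMPLEX entry AFAC( 1, K ) is assigned to the REAL scalar
+*        AKK; the explicit REAL( ) below (presumably added to avoid
+*        an implicit COMPLEX-to-REAL conversion warning) makes that
+*        intent clear.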
* - AKK = AFAC( 1, K ) + AKK = REAL( AFAC( 1, K ) ) CALL CSSCAL( KLEN+1, AKK, AFAC( 1, K ), 1 ) * 40 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cpot01.f b/lapack-netlib/TESTING/LIN/cpot01.f index 00e195dd6..fbcf65086 100644 --- a/lapack-netlib/TESTING/LIN/cpot01.f +++ b/lapack-netlib/TESTING/LIN/cpot01.f @@ -176,7 +176,7 @@ * * Compute the (K,K) element of the result. * - TR = CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = REAL( CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. @@ -224,7 +224,7 @@ 70 CONTINUE END IF * -* Compute norm( L*U - A ) / ( N * norm(A) * EPS ) +* Compute norm(L*U - A) / ( N * norm(A) * EPS ) * RESID = CLANHE( '1', UPLO, N, AFAC, LDAFAC, RWORK ) * diff --git a/lapack-netlib/TESTING/LIN/cppt01.f b/lapack-netlib/TESTING/LIN/cppt01.f index 3a761a4c7..f865ec779 100644 --- a/lapack-netlib/TESTING/LIN/cppt01.f +++ b/lapack-netlib/TESTING/LIN/cppt01.f @@ -178,7 +178,7 @@ * * Compute the (K,K) element of the result. * - TR = CDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) + TR = REAL( CDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) ) AFAC( KC+K-1 ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/cpst01.f b/lapack-netlib/TESTING/LIN/cpst01.f index 26da4b394..03d25515d 100644 --- a/lapack-netlib/TESTING/LIN/cpst01.f +++ b/lapack-netlib/TESTING/LIN/cpst01.f @@ -219,7 +219,7 @@ * * Compute the (K,K) element of the result. * - TR = CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = REAL( CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/dchktr.f b/lapack-netlib/TESTING/LIN/dchktr.f index a4a1150c0..57e87326b 100644 --- a/lapack-netlib/TESTING/LIN/dchktr.f +++ b/lapack-netlib/TESTING/LIN/dchktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS +*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -198,13 +198,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DLAMCH, DUMMY, RCOND, + $ RCONDC, RCONDI, RCONDO, RES, SCALE * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, DCOPY, DERRTR, DGET04, - $ DLACPY, DLARHS, DLATRS, DLATTR, DTRCON, DTRRFS, - $ DTRT01, DTRT02, DTRT03, DTRT05, DTRT06, DTRTRI, - $ DTRTRS, XLAENV + $ DLACPY, DLAMCH, DSCAL, DLARHS, DLATRS, DLATRS3, + $ DLATTR, DTRCON, DTRRFS, DTRT01, DTRT02, DTRT03, + $ DTRT05, DTRT06, DTRTRI, DTRTRS, XLAENV * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -239,6 +239,7 @@ * PATH( 1: 1 ) = 'Double precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'DLATRS3' + CALL DCOPY( N, X, 1, B, 1 ) + CALL DCOPY( N, X, 1, B( N+1 ), 1 ) + CALL DSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL DLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from DLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL DSCAL( N, BIGNUM, X, 1 ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +583,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'DLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +603,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/ddrvls.f b/lapack-netlib/TESTING/LIN/ddrvls.f index b64930c10..b3d07d67f 100644 --- a/lapack-netlib/TESTING/LIN/ddrvls.f +++ b/lapack-netlib/TESTING/LIN/ddrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> DDRVLS tests the least squares driver routines DGELS, DGETSLS, DGELSS, DGELSY, -*> and DGELSD. +*> DDRVLS tests the least squares driver routines DGELS, DGELST, +*> DGETSLS, DGELSS, DGELSY, and DGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) DOUBLE PRECISION ONE, TWO, ZERO @@ -225,8 +225,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, - $ LWORK_DGELS, LWORK_DGETSLS, LWORK_DGELSS, - $ LWORK_DGELSY, LWORK_DGELSD + $ LWORK_DGELS, LWORK_DGELST, LWORK_DGETSLS, + $ LWORK_DGELSS, LWORK_DGELSY, LWORK_DGELSD DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. @@ -243,12 +243,12 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, DERRLS, DGELS, - $ DGELSD, DGELSS, DGELSY, DGEMM, DLACPY, - $ DLARNV, DLASRT, DQRT13, DQRT15, DQRT16, DSCAL, - $ XLAENV + $ DGELSD, DGELSS, DGELST, DGELSY, DGEMM, + $ DGETSLS, DLACPY, DLARNV, DQRT13, DQRT15, + $ DQRT16, DSCAL, XLAENV * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, INT, LOG, MAX, MIN, SQRT + INTRINSIC DBLE, INT, MAX, MIN, SQRT * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -330,7 +330,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -357,6 +358,10 @@ CALL DGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_DGELS = INT ( WQ ( 1 ) ) +* Compute workspace needed for DGELST + CALL DGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_DGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for DGETSLS CALL DGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -378,9 +383,9 @@ * Compute LIWORK workspace needed for DGELSY and DGELSD LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGETSLS, - $ LWORK_DGELSY, LWORK_DGELSS, - $ LWORK_DGELSD ) + LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGELST, + $ LWORK_DGETSLS, LWORK_DGELSY, + $ LWORK_DGELSS, LWORK_DGELSD ) END IF ENDDO ENDDO @@ -411,21 +416,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 110 -* +* ===================================================== +* Begin test DGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test DGELS -* * Generate a matrix of scaling type ISCALE * CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -469,20 +479,27 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for DGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL DQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for DGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * RESULT( 2 ) = DQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, @@ -500,35 +517,42 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9999 )TRANS, M, + WRITE( NOUT, FMT = 9999 ) TRANS, M, $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test DGETSLS + END DO + END DO + END IF +* ===================================================== +* End test DGELS +* ===================================================== +* ===================================================== +* Begin test DGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. 
+* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -563,31 +587,38 @@ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'DGETSLS ' - CALL DGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'DGELST' + CALL DGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'DGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'DGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for DGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL DQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, - $ RESULT( 15 ) ) + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for DGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * - RESULT( 16 ) = DQRT17( TRANS, 1, M, N, + RESULT( 4 ) = DQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -595,7 +626,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = DQRT14( TRANS, M, N, + RESULT( 4 ) = DQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -603,21 +634,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 ) TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test DGELST +* ===================================================== +* ===================================================== +* Begin test DGETSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO IMB = 1, NNB + MB = NBVAL( IMB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'T' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL DLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL DSCAL( NCOLS*NRHS, + $ ONE / DBLE( NCOLS ), + $ WORK, 1 ) + END IF + CALL DGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, ONE, + $ COPYA, LDA, WORK, LDWORK, + $ ZERO, B, LDB ) + CALL DLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. 
N.GT.0 ) THEN + CALL DLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL DLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'DGETSLS' + CALL DGETSLS( TRANS, M, N, NRHS, + $ A, LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DGETSLS', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for DGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL DLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL DQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for DGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = DQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = DQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, + $ B, LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 ) TRANS, + $ M, N, NRHS, MB, NB, ITYPE, + $ K, RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test DGETSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -662,37 +823,37 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = DQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = DQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 4 ) ) + $ WORK( M*NRHS+1 ), RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -716,38 +877,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 8 ) ) + $ WORK( M*NRHS+1 ), RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -776,45 +937,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 12 ) ) + $ WORK( M*NRHS+1 ), RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 
14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 90 K = 3, 14 + DO 90 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -826,6 +987,12 @@ NRUN = NRUN + 12 * 100 CONTINUE + + + + + + 110 CONTINUE 120 CONTINUE 130 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/derrls.f b/lapack-netlib/TESTING/LIN/derrls.f index a1f74dec2..09d745238 100644 --- a/lapack-netlib/TESTING/LIN/derrls.f +++ b/lapack-netlib/TESTING/LIN/derrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> DERRLS tests the error exits for the DOUBLE PRECISION least squares -*> driver routines (DGELS, SGELSS, SGELSY, SGELSD). +*> driver routines (DGELS, DGELST, DGETSLS, SGELSS, SGELSY, SGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DGELS, DGELSD, DGELSS, DGELSY + EXTERNAL ALAESM, CHKXER, DGELS, DGELSD, DGELSS, DGELST, + $ DGELSY, DGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL DGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'DGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'DGELS ', INFOT, NOUT, LERR, OK ) * +* DGELST +* + SRNAMT = 'DGELST' + INFOT = 1 + CALL DGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) +* +* DGETSLS +* + SRNAMT = 'DGETSLS' + INFOT = 1 + CALL DGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) +* * DGELSS * SRNAMT = 'DGELSS' diff --git 
a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f index a667f0d2b..d0580497d 100644 --- a/lapack-netlib/TESTING/LIN/derrtr.f +++ b/lapack-netlib/TESTING/LIN/derrtr.f @@ -83,9 +83,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, DTBCON, - $ DTBRFS, DTBTRS, DTPCON, DTPRFS, DTPTRI, DTPTRS, - $ DTRCON, DTRRFS, DTRTI2, DTRTRI, DTRTRS + EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, + $ DLATRS3, DTBCON, DTBRFS, DTBTRS, DTPCON, + $ DTPRFS, DTPTRI, DTPTRS, DTRCON, DTRRFS, + $ DTRTI2, DTRTRI, DTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ INFOT = 7 CALL DLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'DLATRS', INFOT, NOUT, LERR, OK ) +* +* DLATRS3 +* + SRNAMT = 'DLATRS3' + INFOT = 1 + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/schktr.f b/lapack-netlib/TESTING/LIN/schktr.f index 66fa0bee7..92d876108 100644 --- a/lapack-netlib/TESTING/LIN/schktr.f +++ b/lapack-netlib/TESTING/LIN/schktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS +*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -198,13 +198,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ * .. * .. External Subroutines .. 
EXTERNAL ALAERH, ALAHD, ALASUM, SCOPY, SERRTR, SGET04, - $ SLACPY, SLARHS, SLATRS, SLATTR, STRCON, STRRFS, - $ STRT01, STRT02, STRT03, STRT05, STRT06, STRTRI, - $ STRTRS, XLAENV + $ SLACPY, SLARHS, SLATRS, SLATRS3, SLATTR, SSCAL, + $ STRCON, STRRFS, STRT01, STRT02, STRT03, STRT05, + $ STRT06, STRTRI, STRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -239,6 +239,7 @@ * PATH( 1: 1 ) = 'Single precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,33 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'SLATRS3' + CALL SCOPY( N, X, 1, B, 1 ) + CALL SCOPY( N, X, 1, B( N+1 ), 1 ) + CALL SSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL SLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from SLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) +* + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL SSCAL( N, BIGNUM, X, 1 ) + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +584,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'SLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +604,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/sdrvls.f b/lapack-netlib/TESTING/LIN/sdrvls.f index b96451503..2baf9a3fb 100644 --- a/lapack-netlib/TESTING/LIN/sdrvls.f +++ b/lapack-netlib/TESTING/LIN/sdrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> SDRVLS tests the least squares driver routines SGELS, SGETSLS, SGELSS, SGELSY, -*> and SGELSD. +*> SDRVLS tests the least squares driver routines SGELS, SGELST, +*> SGETSLS, SGELSS, SGELSY and SGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) REAL ONE, TWO, ZERO @@ -225,8 +225,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, - $ LWORK_SGELS, LWORK_SGETSLS, LWORK_SGELSS, - $ LWORK_SGELSY, LWORK_SGELSD + $ LWORK_SGELS, LWORK_SGELST, LWORK_SGETSLS, + $ LWORK_SGELSS, LWORK_SGELSY, LWORK_SGELSD REAL EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. @@ -243,12 +243,12 @@ * .. * .. External Subroutines .. 
EXTERNAL ALAERH, ALAHD, ALASVM, SAXPY, SERRLS, SGELS, - $ SGELSD, SGELSS, SGELSY, SGEMM, SLACPY, - $ SLARNV, SQRT13, SQRT15, SQRT16, SSCAL, - $ XLAENV, SGETSLS + $ SGELSD, SGELSS, SGELST, SGELSY, SGEMM, + $ SGETSLS, SLACPY, SLARNV, SQRT13, SQRT15, + $ SQRT16, SSCAL, XLAENV * .. * .. Intrinsic Functions .. - INTRINSIC INT, LOG, MAX, MIN, REAL, SQRT + INTRINSIC INT, MAX, MIN, REAL, SQRT * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -330,7 +330,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -357,6 +358,10 @@ CALL SGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ( 1 ), -1, INFO ) LWORK_SGELS = INT ( WQ( 1 ) ) +* Compute workspace needed for SGELST + CALL SGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_SGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for SGETSLS CALL SGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ( 1 ), -1, INFO ) @@ -378,9 +383,9 @@ * Compute LIWORK workspace needed for SGELSY and SGELSD LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGETSLS, - $ LWORK_SGELSY, LWORK_SGELSS, - $ LWORK_SGELSD ) + LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGELST, + $ LWORK_SGETSLS, LWORK_SGELSY, + $ LWORK_SGELSS, LWORK_SGELSD ) END IF ENDDO ENDDO @@ -411,21 +416,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 110 -* +* ===================================================== +* Begin test SGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test SGELS -* * Generate a matrix of scaling type ISCALE * CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -469,20 +479,27 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for SGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL SQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for SGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * RESULT( 2 ) = SQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, @@ -500,7 +517,7 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -509,26 +526,33 @@ $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test SGETSLS + END DO + END DO + END IF +* ===================================================== +* End test SGELS +* ===================================================== +* ===================================================== +* Begin test SGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) -* - DO 60 ITRAN = 1, 2 +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -563,31 +587,38 @@ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'SGETSLS ' - CALL SGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'SGELST' + CALL SGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'SGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'SGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for SGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL SQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, - $ RESULT( 15 ) ) + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for SGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * - RESULT( 16 ) = SQRT17( TRANS, 1, M, N, + RESULT( 4 ) = SQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -595,7 +626,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = SQRT14( TRANS, M, N, + RESULT( 4 ) = SQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -603,21 +634,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 ) TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO END IF +* ===================================================== +* End test SGELST +* ===================================================== +* ===================================================== +* Begin test SGETSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO IMB = 1, NNB + MB = NBVAL( IMB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. 
+* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'T' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL SLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL SSCAL( NCOLS*NRHS, + $ ONE / REAL( NCOLS ), + $ WORK, 1 ) + END IF + CALL SGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, ONE, + $ COPYA, LDA, WORK, LDWORK, + $ ZERO, B, LDB ) + CALL SLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL SLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL SLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'SGETSLS' + CALL SGETSLS( TRANS, M, N, NRHS, + $ A, LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SGETSLS', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for SGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL SLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL SQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for SGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = SQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = SQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, + $ B, LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 ) TRANS, + $ M, N, NRHS, MB, NB, ITYPE, + $ K, RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO + END IF +* ===================================================== +* End test SGETSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -662,37 +823,37 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = SQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = SQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 4 ) ) + $ WORK( M*NRHS+1 ), RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -716,38 +877,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 8 ) ) + $ WORK( M*NRHS+1 ), RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -776,45 +937,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 12 ) ) + $ WORK( M*NRHS+1 ), RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 
14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 90 K = 3, 14 + DO 90 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/serrls.f b/lapack-netlib/TESTING/LIN/serrls.f index e6ee4360f..6c4820066 100644 --- a/lapack-netlib/TESTING/LIN/serrls.f +++ b/lapack-netlib/TESTING/LIN/serrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> SERRLS tests the error exits for the REAL least squares -*> driver routines (SGELS, SGELSS, SGELSY, SGELSD). +*> driver routines (SGELS, SGELST, SGETSLS, SGELSS, SGELSY, SGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SGELS, SGELSD, SGELSS, SGELSY + EXTERNAL ALAESM, CHKXER, SGELS, SGELSD, SGELSS, SGELST, + $ SGELSY, SGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL SGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'SGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'SGELS ', INFOT, NOUT, LERR, OK ) * +* SGELST +* + SRNAMT = 'SGELST' + INFOT = 1 + CALL SGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) +* +* SGETSLS +* + SRNAMT = 'SGETSLS' + INFOT = 1 + CALL SGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) +* * SGELSS * SRNAMT = 'SGELSS' diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f index f0d0a0ef2..af1ce0a8e 100644 ---
a/lapack-netlib/TESTING/LIN/serrtr.f +++ b/lapack-netlib/TESTING/LIN/serrtr.f @@ -83,9 +83,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, STBCON, - $ STBRFS, STBTRS, STPCON, STPRFS, STPTRI, STPTRS, - $ STRCON, STRRFS, STRTI2, STRTRI, STRTRS + EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, + $ SLATRS3, STBCON, STBRFS, STBTRS, STPCON, + $ STPRFS, STPTRI, STPTRS, STRCON, STRRFS, STRTI2, + $ STRTRI, STRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ INFOT = 7 CALL SLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'SLATRS', INFOT, NOUT, LERR, OK ) +* +* SLATRS3 +* + SRNAMT = 'SLATRS3' + INFOT = 1 + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/zchkpt.f b/lapack-netlib/TESTING/LIN/zchkpt.f index 80e1690a7..11089d2a1 100644 --- a/lapack-netlib/TESTING/LIN/zchkpt.f +++ b/lapack-netlib/TESTING/LIN/zchkpt.f @@ -319,15 +319,15 @@ * elements. * IF( IZERO.EQ.1 ) THEN - D( 1 ) = Z( 2 ) + D( 1 ) = DBLE( Z( 2 ) ) IF( N.GT.1 ) $ E( 1 ) = Z( 3 ) ELSE IF( IZERO.EQ.N ) THEN E( N-1 ) = Z( 1 ) - D( N ) = Z( 2 ) + D( N ) = DBLE( Z( 2 ) ) ELSE E( IZERO-1 ) = Z( 1 ) - D( IZERO ) = Z( 2 ) + D( IZERO ) = DBLE( Z( 2 ) ) E( IZERO ) = Z( 3 ) END IF END IF diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 0a6f47b1e..275ca2857 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS +*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, DLAMCH * .. * .. 
Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -209,10 +209,10 @@ EXTERNAL LSAME, ZLANTR * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZCOPY, ZERRTR, - $ ZGET04, ZLACPY, ZLARHS, ZLATRS, ZLATTR, ZTRCON, - $ ZTRRFS, ZTRT01, ZTRT02, ZTRT03, ZTRT05, ZTRT06, - $ ZTRTRI, ZTRTRS + EXTERNAL ALAERH, ALAHD, ALASUM, DLAMCH, XLAENV, ZCOPY, + $ ZDSCAL, ZERRTR, ZGET04, ZLACPY, ZLARHS, ZLATRS, + $ ZLATRS3, ZLATTR, ZTRCON, ZTRRFS, ZTRT01, + $ ZTRT02, ZTRT03, ZTRT05, ZTRT06, ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Zomplex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = DBLE( A( 1 ) ) * CALL ZTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'ZLATRS3' + CALL ZCOPY( N, X, 1, B, 1 ) + CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) + CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from ZLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL ZDSCAL( N, BIGNUM, X, 1 ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'ZLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -565,8 +599,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/zdrvgt.f b/lapack-netlib/TESTING/LIN/zdrvgt.f index d055e4bdb..b2e0f66b1 100644 --- a/lapack-netlib/TESTING/LIN/zdrvgt.f +++ b/lapack-netlib/TESTING/LIN/zdrvgt.f @@ -307,16 +307,16 @@ IZERO = 0 ELSE IF( IMAT.EQ.8 ) THEN IZERO = 1 - Z( 2 ) = A( N ) + Z( 2 ) = DBLE( A( N ) ) A( N ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = A( 1 ) + Z( 3 ) = DBLE( A( 1 ) ) A( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N - Z( 1 ) = A( 3*N-2 ) - Z( 2 ) = A( 2*N-1 ) + Z( 1 ) = DBLE( A( 3*N-2 ) ) + Z( 2 ) = DBLE( A( 2*N-1 ) ) A( 3*N-2 ) = ZERO A( 2*N-1 ) = ZERO ELSE diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f index 2eab97905..b21345d30 100644 --- a/lapack-netlib/TESTING/LIN/zdrvls.f +++ b/lapack-netlib/TESTING/LIN/zdrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> ZDRVLS tests the least squares driver routines ZGELS, ZGETSLS, ZGELSS, ZGELSY -*> and ZGELSD. +*> ZDRVLS tests the least squares driver routines ZGELS, ZGELST, +*> ZGETSLS, ZGELSS, ZGELSY and ZGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) DOUBLE PRECISION ONE, ZERO @@ -228,8 +228,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, LRWORK, - $ LWORK_ZGELS, LWORK_ZGETSLS, LWORK_ZGELSS, - $ LWORK_ZGELSY, LWORK_ZGELSD, + $ LWORK_ZGELS, LWORK_ZGELST, LWORK_ZGETSLS, + $ LWORK_ZGELSS, LWORK_ZGELSY, LWORK_ZGELSD, $ LRWORK_ZGELSY, LRWORK_ZGELSS, LRWORK_ZGELSD DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. @@ -248,10 +248,10 @@ EXTERNAL DASUM, DLAMCH, ZQRT12, ZQRT14, ZQRT17 * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, DLASRT, XLAENV, - $ ZDSCAL, ZERRLS, ZGELS, ZGELSD, ZGELSS, - $ ZGELSY, ZGEMM, ZLACPY, ZLARNV, ZQRT13, ZQRT15, - $ ZQRT16, ZGETSLS + EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, ZERRLS, ZGELS, + $ ZGELSD, ZGELSS, ZGELST, ZGELSY, ZGEMM, + $ ZGETSLS, ZLACPY, ZLARNV, ZQRT13, ZQRT15, + $ ZQRT16, ZDSCAL, XLAENV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, INT, SQRT @@ -334,7 +334,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. 
* DO IM = 1, NM M = MVAL( IM ) @@ -361,6 +362,10 @@ CALL ZGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_ZGELS = INT ( WQ( 1 ) ) +* Compute workspace needed for ZGELST + CALL ZGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_ZGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for ZGETSLS CALL ZGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -390,9 +395,9 @@ LRWORK = MAX( LRWORK, LRWORK_ZGELSY, $ LRWORK_ZGELSS, LRWORK_ZGELSD ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_ZGELS, LWORK_ZGETSLS, - $ LWORK_ZGELSY, LWORK_ZGELSS, - $ LWORK_ZGELSD ) + LWORK = MAX( LWORK, LWORK_ZGELS, LWORK_ZGELST, + $ LWORK_ZGETSLS, LWORK_ZGELSY, + $ LWORK_ZGELSS, LWORK_ZGELSD ) END IF ENDDO ENDDO @@ -425,21 +430,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 100 -* +* ===================================================== +* Begin test ZGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test ZGELS -* * Generate a matrix of scaling type ISCALE * CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -484,15 +494,20 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for ZGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL ZQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, RWORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for ZGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN @@ -515,7 +530,7 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -524,26 +539,34 @@ $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test ZGETSLS + END DO + END DO + END IF +* ===================================================== +* End test ZGELS +* ===================================================== +* ===================================================== +* Begin test ZGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + CALL XLAENV( 3, NXVAL( INB ) ) +* +* Loop for testing non-transposed and transposed. 
+* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -560,9 +583,9 @@ IF( NCOLS.GT.0 ) THEN CALL ZLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) - CALL ZSCAL( NCOLS*NRHS, - $ CONE / DBLE( NCOLS ), WORK, - $ 1 ) + CALL ZDSCAL( NCOLS*NRHS, + $ ONE / DBLE( NCOLS ), WORK, + $ 1 ) END IF CALL ZGEMM( TRANS, 'No transpose', NROWS, $ NRHS, NCOLS, CONE, COPYA, LDA, @@ -578,31 +601,37 @@ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'ZGETSLS ' - CALL ZGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'ZGELST' + CALL ZGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) +* IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'ZGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'ZGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for ZGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL ZQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK2, - $ RESULT( 15 ) ) + $ LDA, B, LDB, C, LDB, RWORK, + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for ZGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * * Solving LS system * - RESULT( 16 ) = ZQRT17( TRANS, 1, M, N, + RESULT( 4 ) = ZQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -610,7 +639,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = ZQRT14( TRANS, M, N, + RESULT( 4 ) = ZQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -618,21 +647,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 )TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test ZGELST +* ===================================================== +* ===================================================== +* Begin test ZGELSTSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO INB = 1, NNB + MB = NBVAL( INB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO IMB = 1, NNB + NB = NBVAL( IMB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'C' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL ZLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL ZSCAL( NCOLS*NRHS, + $ CONE / DBLE( NCOLS ), + $ WORK, 1 ) + END IF + CALL ZGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, CONE, + $ COPYA, LDA, WORK, LDWORK, + $ CZERO, B, LDB ) + CALL ZLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. 
N.GT.0 ) THEN + CALL ZLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL ZLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'ZGETSLS ' + CALL ZGETSLS( TRANS, M, N, NRHS, A, + $ LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZGETSLS ', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for ZGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL ZLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL ZQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK2, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for ZGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = ZQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = ZQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, B, + $ LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 )TRANS, + $ M, N, NRHS, MB, NB, ITYPE, K, + $ RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test ZGELSTSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
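The "Set up a consistent rhs" steps in the ZGELST and ZGETSLS blocks above draw a random X with ZLARNV, scale it by 1/NCOLS, and form B = op(A)*X with ZGEMM, so that the system being solved is exactly consistent and the residual tests are meaningful. A rough C equivalent, assuming CBLAS from OpenBLAS and a placeholder RNG (illustrative only, not the test code):

    /* Build a consistent right-hand side b = A*x for a column-major
     * m-by-n matrix a, mirroring the ZLARNV / ZDSCAL / ZGEMM sequence. */
    #include <stdlib.h>
    #include <complex.h>
    #include <cblas.h>

    void make_consistent_rhs(int m, int n, const double complex *a, int lda,
                             double complex *b /* length m */) {
        double complex *x = malloc(sizeof *x * n);
        for (int j = 0; j < n; ++j)               /* random x, scaled by 1/n */
            x[j] = ((rand() / (double)RAND_MAX - 0.5) +
                    (rand() / (double)RAND_MAX - 0.5) * I) / n;

        const double complex one = 1.0, zero = 0.0;
        cblas_zgemv(CblasColMajor, CblasNoTrans, m, n,
                    &one, a, lda, x, 1, &zero, b, 1);   /* b := A * x */
        free(x);
    }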
@@ -680,37 +839,37 @@ * * workspace used: 2*MNMIN+NB*NB+NB*MAX(N,NRHS) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = ZQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = ZQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK, RWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 4 ) ) + $ RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -736,38 +895,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 8 ) ) + $ RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -792,45 +951,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 12 ) ) + $ RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace 
of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 80 K = 3, 14 + DO 80 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/zdrvpt.f b/lapack-netlib/TESTING/LIN/zdrvpt.f index 14a9f76ba..75f4d5738 100644 --- a/lapack-netlib/TESTING/LIN/zdrvpt.f +++ b/lapack-netlib/TESTING/LIN/zdrvpt.f @@ -266,12 +266,12 @@ * IA = 1 DO 20 I = 1, N - 1 - D( I ) = A( IA ) + D( I ) = DBLE( A( IA ) ) E( I ) = A( IA+1 ) IA = IA + 2 20 CONTINUE IF( N.GT.0 ) - $ D( N ) = A( IA ) + $ D( N ) = DBLE( A( IA ) ) ELSE * * Type 7-12: generate a diagonally dominant matrix with @@ -333,13 +333,13 @@ Z( 2 ) = D( 1 ) D( 1 ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = E( 1 ) + Z( 3 ) = DBLE( E( 1 ) ) E( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N IF( N.GT.1 ) THEN - Z( 1 ) = E( N-1 ) + Z( 1 ) = DBLE( E( N-1 ) ) E( N-1 ) = ZERO END IF Z( 2 ) = D( N ) @@ -347,9 +347,9 @@ ELSE IF( IMAT.EQ.10 ) THEN IZERO = ( N+1 ) / 2 IF( IZERO.GT.1 ) THEN - Z( 1 ) = E( IZERO-1 ) + Z( 1 ) = DBLE( E( IZERO-1 ) ) E( IZERO-1 ) = ZERO - Z( 3 ) = E( IZERO ) + Z( 3 ) = DBLE( E( IZERO ) ) E( IZERO ) = ZERO END IF Z( 2 ) = D( IZERO ) diff --git a/lapack-netlib/TESTING/LIN/zerrls.f b/lapack-netlib/TESTING/LIN/zerrls.f index 66e56c8c6..22f049ee0 100644 --- a/lapack-netlib/TESTING/LIN/zerrls.f +++ b/lapack-netlib/TESTING/LIN/zerrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> ZERRLS tests the error exits for the COMPLEX*16 least squares -*> driver routines (ZGELS, CGELSS, CGELSY, CGELSD). +*> driver routines (ZGELS, ZGELST, ZGETSLS, CGELSS, CGELSY, CGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZGELS, ZGELSD, ZGELSS, ZGELSY + EXTERNAL ALAESM, CHKXER, ZGELS, ZGELSD, ZGELSS, ZGELST, + $ ZGELSY, ZGETSLS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL ZGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'ZGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'ZGELS ', INFOT, NOUT, LERR, OK ) * +* ZGELST +* + SRNAMT = 'ZGELST' + INFOT = 1 + CALL ZGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) +* +* ZGETSLS +* + SRNAMT = 'ZGETSLS' + INFOT = 1 + CALL ZGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) +* * ZGELSS * SRNAMT = 'ZGELSS' diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f index 098040ace..211b92154 100644 --- a/lapack-netlib/TESTING/LIN/zerrtr.f +++ b/lapack-netlib/TESTING/LIN/zerrtr.f @@ -82,9 +82,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, ZTBCON, - $ ZTBRFS, ZTBTRS, ZTPCON, ZTPRFS, ZTPTRI, ZTPTRS, - $ ZTRCON, ZTRRFS, ZTRTI2, ZTRTRI, ZTRTRS + EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, + $ ZLATRS3, ZTBCON, ZTBRFS, ZTBTRS, ZTPCON, + $ ZTPRFS, ZTPTRI, ZTPTRS, ZTRCON, ZTRRFS, ZTRTI2, + $ ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -240,6 +241,46 @@ CALL ZLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'ZLATRS', INFOT, NOUT, LERR, OK ) * +* ZLATRS3 +* + SRNAMT = 'ZLATRS3' + INFOT = 1 + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/zlattp.f b/lapack-netlib/TESTING/LIN/zlattp.f index b728852b5..e05d9299e 100644 --- a/lapack-netlib/TESTING/LIN/zlattp.f +++ b/lapack-netlib/TESTING/LIN/zlattp.f @@ -336,7 +336,7 @@ WORK( J+1 ) = PLUS2 WORK( N+J+1 ) = ZERO PLUS1 = STAR1 / PLUS2 - REXP = ZLARND( 2, ISEED ) + REXP = DBLE( ZLARND( 2, ISEED ) ) IF( REXP.LT.ZERO ) THEN STAR1 = -SFAC**( ONE-REXP )*ZLARND( 5, ISEED ) ELSE @@ -790,7 +790,7 @@ DO 460 J = 1, N / 2 JL = JJ DO 450 I = J, N - J - T = AP( JR-I+J ) + T = DBLE( AP( JR-I+J ) ) AP( JR-I+J ) = AP( JL ) AP( JL ) = T JL = JL + I @@ -804,7 +804,7 @@ DO 480 J = 1, N / 2 JR = JJ DO 470 I = J, N - J - T = AP( JL+I-J ) + T = DBLE( AP( JL+I-J ) ) AP( JL+I-J ) = AP( JR ) AP( JR ) = T JR = JR - I diff --git a/lapack-netlib/TESTING/LIN/zpbt01.f b/lapack-netlib/TESTING/LIN/zpbt01.f index fb7881ac7..1801b66cf 100644 --- a/lapack-netlib/TESTING/LIN/zpbt01.f +++ b/lapack-netlib/TESTING/LIN/zpbt01.f @@ -201,7 +201,8 @@ * * Compute the (K,K) element of the result. * - AKK = ZDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) + AKK = DBLE( + $ ZDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) ) AFAC( KD+1, K ) = AKK * * Compute the rest of column K. @@ -228,7 +229,7 @@ * * Scale column K by the diagonal element. * - AKK = AFAC( 1, K ) + AKK = DBLE( AFAC( 1, K ) ) CALL ZDSCAL( KLEN+1, AKK, AFAC( 1, K ), 1 ) * 40 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/zpot01.f b/lapack-netlib/TESTING/LIN/zpot01.f index d71445cd4..de83414c6 100644 --- a/lapack-netlib/TESTING/LIN/zpot01.f +++ b/lapack-netlib/TESTING/LIN/zpot01.f @@ -176,7 +176,7 @@ * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. 
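The DBLE( ZDOTC( ... ) ) changes in zpbt01 and zpot01 above (and in zppt01 / zpst01 just below) make an implicit conversion explicit: the rebuilt (K,K) entry is the conjugated dot product of a column with itself, which is real up to rounding, so only its real part is assigned to the double-precision variable. A tiny C illustration of the same point, assuming cblas_zdotc_sub from OpenBLAS (the values are arbitrary):

    /* v^H * v is real up to rounding; keeping only the real part is what
     * the DBLE( ZDOTC(...) ) wrappers in the test routines spell out. */
    #include <stdio.h>
    #include <complex.h>
    #include <cblas.h>

    int main(void) {
        double complex v[3] = { 1.0 + 2.0 * I, -0.5 + 0.25 * I, 3.0 - 1.0 * I };
        double complex dot;

        cblas_zdotc_sub(3, v, 1, v, 1, &dot);   /* dot = conj(v)^T * v */
        double akk = creal(dot);                /* imaginary part is ~0 */

        printf("v^H v = %g + %gi, kept as %g\n", creal(dot), cimag(dot), akk);
        return 0;
    }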
@@ -224,7 +224,7 @@ 70 CONTINUE END IF * -* Compute norm( L*U - A ) / ( N * norm(A) * EPS ) +* Compute norm(L*U - A) / ( N * norm(A) * EPS ) * RESID = ZLANHE( '1', UPLO, N, AFAC, LDAFAC, RWORK ) * diff --git a/lapack-netlib/TESTING/LIN/zppt01.f b/lapack-netlib/TESTING/LIN/zppt01.f index 78ec595af..acaea50d2 100644 --- a/lapack-netlib/TESTING/LIN/zppt01.f +++ b/lapack-netlib/TESTING/LIN/zppt01.f @@ -178,7 +178,7 @@ * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) ) AFAC( KC+K-1 ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/zpst01.f b/lapack-netlib/TESTING/LIN/zpst01.f index 691857219..bed18c514 100644 --- a/lapack-netlib/TESTING/LIN/zpst01.f +++ b/lapack-netlib/TESTING/LIN/zpst01.f @@ -219,7 +219,7 @@ * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index fd4e57048..1d44e9490 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -39,8 +39,12 @@ set(UNIT_SOURCES2 trti2/trti2_L.c ) +if (NOT RELAPACK_REPLACE) GenerateNamedObjects("${LAPACK_SOURCES}") GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) +else() +GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) +endif() GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" false 3) GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3) @@ -113,4 +117,3 @@ GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) add_library(lapack OBJECT ${OPENBLAS_SRC}) - diff --git a/openblas.pc.in b/openblas.pc.in index ff849807c..8ad6e8bee 100644 --- a/openblas.pc.in +++ b/openblas.pc.in @@ -2,6 +2,6 @@ Name: openblas Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: ${version} URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas +Libs: -L${libdir} -lopenblas${libsuffix} Libs.private: ${extralib} Cflags: -I${includedir} diff --git a/param.h b/param.h index b9b9a55e8..19cbe75a5 100644 --- a/param.h +++ b/param.h @@ -79,6 +79,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SBGEMM_DEFAULT_P 256 #define SBGEMM_DEFAULT_R 256 #define SBGEMM_DEFAULT_Q 256 +#define SBGEMM_ALIGN_K 1 // must be 2^x + #ifdef OPTERON #define SNUMOPT 4 @@ -3365,6 +3367,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEV1) +#define SWITCH_RATIO 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3394,6 +3398,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN2) +#undef SBGEMM_ALIGN_K +#define SBGEMM_ALIGN_K 4 + #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_N #define SBGEMM_DEFAULT_UNROLL_M 8 diff --git a/relapack/Makefile b/relapack/Makefile index ddf101bd1..056a0ee48 100644 --- a/relapack/Makefile +++ b/relapack/Makefile @@ -1,53 +1,61 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system - +ifeq ($(RELAPACK_REPLACE),0) +RELAPREFIX=RELAPACK_ +INCLALL=-DINCLUDE_ALL=0 +else +INCLALL=-DINCLUDE_ALL=1 +endif SRC = $(wildcard src/*.c) SRC1 = \ - src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ - src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ - src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ - src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c + slauum.c clauum.c dlauum.c zlauum.c \ + strtri.c dtrtri.c ctrtri.c ztrtri.c \ + spotrf.c dpotrf.c cpotrf.c zpotrf.c \ + sgetrf.c dgetrf.c cgetrf.c zgetrf.c SRC2 = \ - src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ - src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ - src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ - src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ - src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ - src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ - src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ - src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ - src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c + cgbtrf.c cpbtrf.c dsytrf_rec2.c sgbtrf.c ssytrf_rook.c zhegst.c zsytrf_rec2.c \ + cgemmt.c dgbtrf.c dsytrf_rook.c sgemmt.c ssytrf_rook_rec2.c zhetrf.c zsytrf_rook.c \ + csytrf.c dgemmt.c dsytrf_rook_rec2.c stgsyl.c zhetrf_rec2.c zsytrf_rook_rec2.c \ + chegst.c csytrf_rec2.c dtgsyl.c strsyl.c zhetrf_rook.c ztgsyl.c \ + chetrf.c csytrf_rook.c dtrsyl.c spbtrf.c strsyl_rec2.c zhetrf_rook_rec2.c ztrsyl.c \ + chetrf_rec2.c csytrf_rook_rec2.c dpbtrf.c dtrsyl_rec2.c ztrsyl_rec2.c \ + chetrf_rook.c ctgsyl.c ssygst.c zgbtrf.c zpbtrf.c \ + chetrf_rook_rec2.c ctrsyl.c dsygst.c f2c.c ssytrf.c zgemmt.c \ + ctrsyl_rec2.c dsytrf.c lapack_wrappers.c ssytrf_rec2.c zsytrf.c SRCX = \ - src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ - src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ - src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ - src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ - src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ - src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ - src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ - src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ - src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c - -OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) -OBJS2 = $(SRC2:%.c=%.o) + cgbtrf.c cpbtrf.c ctrtri.c dsytrf_rec2.c sgbtrf.c ssytrf_rook.c zhegst.c zsytrf_rec2.c \ + cgemmt.c cpotrf.c dgbtrf.c dsytrf_rook.c sgemmt.c ssytrf_rook_rec2.c zhetrf.c zsytrf_rook.c \ + cgetrf.c csytrf.c dgemmt.c dsytrf_rook_rec2.c sgetrf.c stgsyl.c zhetrf_rec2.c 
zsytrf_rook_rec2.c \ + chegst.c csytrf_rec2.c dgetrf.c dtgsyl.c slauum.c strsyl.c zhetrf_rook.c ztgsyl.c \ + chetrf.c csytrf_rook.c dlauum.c dtrsyl.c spbtrf.c strsyl_rec2.c zhetrf_rook_rec2.c ztrsyl.c \ + chetrf_rec2.c csytrf_rook_rec2.c dpbtrf.c dtrsyl_rec2.c spotrf.c strtri.c zlauum.c ztrsyl_rec2.c \ + chetrf_rook.c ctgsyl.c dpotrf.c dtrtri.c ssygst.c zgbtrf.c zpbtrf.c ztrtri.c \ + chetrf_rook_rec2.c ctrsyl.c dsygst.c f2c.c ssytrf.c zgemmt.c zpotrf.c \ + clauum.c ctrsyl_rec2.c dsytrf.c lapack_wrappers.c ssytrf_rec2.c zgetrf.c zsytrf.c + + +OBJS1 = $(SRC1:%.c=src/$(RELAPREFIX)%.$(SUFFIX)) +OBJS2 = $(SRC2:%.c=src/$(RELAPREFIX)%.o) OBJS = $(OBJS1) $(OBJS2) TEST_SUITS = \ - slauum dlauum clauum zlauum \ - spotrf dpotrf cpotrf zpotrf \ - spbtrf dpbtrf cpbtrf zpbtrf \ - ssygst dsygst chegst zhegst \ - ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ - sgetrf dgetrf cgetrf zgetrf \ - sgbtrf dgbtrf cgbtrf zgbtrf \ - strsyl dtrsyl ctrsyl ztrsyl \ - stgsyl dtgsyl ctgsyl ztgsyl \ sgemmt dgemmt cgemmt zgemmt + + # slauum dlauum clauum zlauum \ + # spotrf dpotrf cpotrf zpotrf \ + # spbtrf dpbtrf cpbtrf zpbtrf \ + # ssygst dsygst chegst zhegst \ + # ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ + # sgetrf dgetrf cgetrf zgetrf \ + # sgbtrf dgbtrf cgbtrf zgbtrf \ + # strsyl dtrsyl ctrsyl ztrsyl \ + # stgsyl dtgsyl ctgsyl ztgsyl \ + TESTS = $(TEST_SUITS:%=test/%.pass) # dummies TEST_EXES = $(TEST_SUITS:%=test/%.x) @@ -63,11 +71,11 @@ libs: $(OBJS) $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) $(RANLIB) $(TOPDIR)/$(LIBNAME) -%.$(SUFFIX): %.c config.h - $(CC) $(CFLAGS) -c $< -o $@ +src/$(RELAPREFIX)%.$(SUFFIX): src/%.c relapack_config.h + $(CC) -v $(CFLAGS) -I. $(INCLALL) -c $< -o $@ -%.o: %.c config.h - $(CC) $(CFLAGS) -c $< -o $@ +src/$(RELAPREFIX)%.o: src/%.c relapack_config.h + $(CC) -v $(CFLAGS) -I. 
$(INCLALL) -c $< -o $@ # ReLAPACK testing diff --git a/relapack/config.h b/relapack/relapack_config.h similarity index 99% rename from relapack/config.h rename to relapack/relapack_config.h index 9d6919463..ba428a61b 100644 --- a/relapack/config.h +++ b/relapack/relapack_config.h @@ -45,7 +45,7 @@ // The following macros specify which routines are included in the library under // LAPACK's symbol names: 1 included, 0 not included -#define INCLUDE_ALL 1 +// #define INCLUDE_ALL 1 #define INCLUDE_XLAUUM INCLUDE_ALL #define INCLUDE_SLAUUM INCLUDE_XLAUUM @@ -115,7 +115,7 @@ #define INCLUDE_CTGSYL INCLUDE_XTGSYL #define INCLUDE_ZTGSYL INCLUDE_XTGSYL -#define INCLUDE_XGEMMT 1 +#define INCLUDE_XGEMMT INCLUDE_ALL #define INCLUDE_SGEMMT INCLUDE_XGEMMT #define INCLUDE_DGEMMT INCLUDE_XGEMMT #define INCLUDE_CGEMMT INCLUDE_XGEMMT diff --git a/relapack/src/CMakeLists.txt b/relapack/src/CMakeLists.txt index 2d861f54b..b92089418 100644 --- a/relapack/src/CMakeLists.txt +++ b/relapack/src/CMakeLists.txt @@ -1,85 +1,86 @@ -include_directories(${PROJECT_SOURCE_DIR}) -include_directories(${PROJECT_BINARY_DIR}) - -set(RELAFILES -clauum.c -ctrsyl_rec2.c -dsytrf.c -spbtrf.c -strsyl_rec2.c -zhetrf_rook_rec2.c -ztrsyl.c -cgbtrf.c -cpbtrf.c -ctrtri.c -dsytrf_rec2.c -spotrf.c -strtri.c -zlauum.c -ztrsyl_rec2.c -cgemmt.c -cpotrf.c -dgbtrf.c -dsytrf_rook.c -lapack_wrappers.c -ssygst.c -zgbtrf.c -zpbtrf.c -ztrtri.c -cgetrf.c -csytrf.c -dgemmt.c -dsytrf_rook_rec2.c -ssytrf.c -zgemmt.c -zpotrf.c -chegst.c -csytrf_rec2.c -dgetrf.c -dtgsyl.c -ssytrf_rec2.c -zgetrf.c -zsytrf.c -chetrf.c -csytrf_rook.c -dlauum.c -dtrsyl.c -sgbtrf.c -ssytrf_rook.c -zhegst.c -zsytrf_rec2.c -chetrf_rec2.c -csytrf_rook_rec2.c -dpbtrf.c -dtrsyl_rec2.c -sgemmt.c -ssytrf_rook_rec2.c -zhetrf.c -zsytrf_rook.c -chetrf_rook.c -ctgsyl.c -dpotrf.c -dtrtri.c -sgetrf.c -stgsyl.c -zhetrf_rec2.c -zsytrf_rook_rec2.c -chetrf_rook_rec2.c -ctrsyl.c -dsygst.c -f2c.c -slauum.c -strsyl.c -zhetrf_rook.c -ztgsyl.c -) - - - -# add relapack folder to the sources -set(RELA_SOURCES "") -foreach (RELA_FILE ${RELAFILES}) - list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") -endforeach () -add_library(relapack_src OBJECT ${RELA_SOURCES}) -set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) +include_directories(${PROJECT_SOURCE_DIR}/relapack) + +set(RELAFILES +clauum.c +ctrsyl_rec2.c +dsytrf.c +spbtrf.c +strsyl_rec2.c +zhetrf_rook_rec2.c +ztrsyl.c +cgbtrf.c +cpbtrf.c +ctrtri.c +dsytrf_rec2.c +spotrf.c +strtri.c +zlauum.c +ztrsyl_rec2.c +cgemmt.c +cpotrf.c +dgbtrf.c +dsytrf_rook.c +lapack_wrappers.c +ssygst.c +zgbtrf.c +zpbtrf.c +ztrtri.c +cgetrf.c +csytrf.c +dgemmt.c +dsytrf_rook_rec2.c +ssytrf.c +zgemmt.c +zpotrf.c +chegst.c +csytrf_rec2.c +dgetrf.c +dtgsyl.c +ssytrf_rec2.c +zgetrf.c +zsytrf.c +chetrf.c +csytrf_rook.c +dlauum.c +dtrsyl.c +sgbtrf.c +ssytrf_rook.c +zhegst.c +zsytrf_rec2.c +chetrf_rec2.c +csytrf_rook_rec2.c +dpbtrf.c +dtrsyl_rec2.c +sgemmt.c +ssytrf_rook_rec2.c +zhetrf.c +zsytrf_rook.c +chetrf_rook.c +ctgsyl.c +dpotrf.c +dtrtri.c +sgetrf.c +stgsyl.c +zhetrf_rec2.c +zsytrf_rook_rec2.c +chetrf_rook_rec2.c +ctrsyl.c +dsygst.c +f2c.c +slauum.c +strsyl.c +zhetrf_rook.c +ztgsyl.c +) + + + +# add relapack folder to the sources +set(RELA_SOURCES "") +foreach (RELA_FILE ${RELAFILES}) + list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") +endforeach () +add_library(relapack_src OBJECT ${RELA_SOURCES}) 
+set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c
index 556491c7a..674d73709 100644
--- a/relapack/src/ctrsyl_rec2.c
+++ b/relapack/src/ctrsyl_rec2.c
@@ -10,7 +10,7 @@
 		http://www.netlib.org/f2c/libf2c.zip
 */
 
-#include "../config.h"
+#include "relapack_config.h"
 #include "f2c.h"
 
 #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h
index 38c5c30d0..44652a074 100644
--- a/relapack/src/relapack.h
+++ b/relapack/src/relapack.h
@@ -1,7 +1,7 @@
 #ifndef RELAPACK_INT_H
 #define RELAPACK_INT_H
 #include
-#include "../../config.h"
+#include "config.h"
 #if defined(OS_WINDOWS) && defined(__64BIT__)
 typedef long long BLASLONG;
 typedef unsigned long long BLASULONG;
@@ -9,7 +9,7 @@ typedef unsigned long long BLASULONG;
 typedef long BLASLONG;
 typedef unsigned long BLASULONG;
 #endif
-#include "../config.h"
+#include "relapack_config.h"
 
 #include "../inc/relapack.h"
 
diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c
index edc6ffc6b..d07a4e8de 100644
--- a/relapack/src/ztrsyl_rec2.c
+++ b/relapack/src/ztrsyl_rec2.c
@@ -10,7 +10,7 @@
 		http://www.netlib.org/f2c/libf2c.zip
 */
 
-#include "../config.h"
+#include "relapack_config.h"
 #include "f2c.h"
 
 #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
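The net effect of the ReLAPACK changes above: relapack/config.h becomes relapack/relapack_config.h, its hard-coded "#define INCLUDE_ALL 1" is commented out, and the Makefile now decides the value on the compiler command line (-DINCLUDE_ALL=1, or -DINCLUDE_ALL=0 plus a RELAPACK_ object prefix when RELAPACK_REPLACE=0), so whether the ReLAPACK routines replace the LAPACK ones is a build-time choice rather than a header edit. A minimal sketch of that kind of switch, reusing the patch's INCLUDE_ALL macro but with an otherwise invented routine name:

    /* Compile with -DINCLUDE_ALL=1 to export the LAPACK-style symbol,
     * or -DINCLUDE_ALL=0 to keep only a prefixed entry point.
     * "xfoo" is purely illustrative, not a real ReLAPACK routine. */
    #ifndef INCLUDE_ALL
    #define INCLUDE_ALL 1              /* default if the build system says nothing */
    #endif
    #define INCLUDE_XFOO INCLUDE_ALL   /* per-routine switches mirror the global one */

    static void relapack_xfoo_impl(void) { /* recursive implementation would live here */ }

    #if INCLUDE_XFOO
    void xfoo_(void)         { relapack_xfoo_impl(); }  /* replaces the LAPACK-style name */
    #else
    void RELAPACK_xfoo(void) { relapack_xfoo_impl(); }  /* prefixed entry point only */
    #endif

The relapack.h and *_rec2.c include changes are the mechanical counterpart of the rename: with the Makefile's added -I., the sources pick up relapack_config.h (and OpenBLAS's own top-level config.h) through the include path instead of the old relative ../config.h paths.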